diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04d19cf286..08686d5fe8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -205,16 +205,17 @@ endif()
 OCV_OPTION(OPENCV_ENABLE_NONFREE "Enable non-free algorithms" OFF)
 
 # 3rd party libs
-OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE)
-OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE)
-OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE)
-OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE)
-OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE)
-OCV_OPTION(BUILD_OPENEXR "Build openexr from source" (WIN32 OR ANDROID OR APPLE) AND NOT WINRT)
-OCV_OPTION(BUILD_WEBP "Build WebP from source" (WIN32 OR ANDROID OR APPLE) AND NOT WINRT)
-OCV_OPTION(BUILD_TBB "Download and build TBB from source" ANDROID )
-OCV_OPTION(BUILD_IPP_IW "Build IPP IW from source" NOT MINGW IF (X86_64 OR X86) AND NOT WINRT )
-OCV_OPTION(BUILD_ITT "Build Intel ITT from source" NOT MINGW IF (X86_64 OR X86) AND NOT WINRT AND NOT APPLE_FRAMEWORK )
+OCV_OPTION(OPENCV_FORCE_3RDPARTY_BUILD "Force using 3rdparty code from source" OFF)
+OCV_OPTION(BUILD_ZLIB "Build zlib from source" (WIN32 OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_TIFF "Build libtiff from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_JASPER "Build libjasper from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_JPEG "Build libjpeg from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_PNG "Build libpng from source" (WIN32 OR ANDROID OR APPLE OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_OPENEXR "Build openexr from source" (((WIN32 OR ANDROID OR APPLE) AND NOT WINRT) OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_WEBP "Build WebP from source" (((WIN32 OR ANDROID OR APPLE) AND NOT WINRT) OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_TBB "Download and build TBB from source" (ANDROID OR OPENCV_FORCE_3RDPARTY_BUILD) )
+OCV_OPTION(BUILD_IPP_IW "Build IPP IW from source" (NOT MINGW OR OPENCV_FORCE_3RDPARTY_BUILD) IF (X86_64 OR X86) AND NOT WINRT )
+OCV_OPTION(BUILD_ITT "Build Intel ITT from source" (NOT MINGW OR OPENCV_FORCE_3RDPARTY_BUILD) IF (X86_64 OR X86) AND NOT WINRT AND NOT APPLE_FRAMEWORK )
 
 # Optional 3rd party components
 # ===================================================
@@ -339,6 +340,7 @@ OCV_OPTION(ENABLE_BUILD_HARDENING "Enable hardening of the resulting binarie
 OCV_OPTION(ENABLE_LTO "Enable Link Time Optimization" OFF IF CV_GCC OR MSVC)
 OCV_OPTION(ENABLE_THIN_LTO "Enable Thin LTO" OFF IF CV_CLANG)
 OCV_OPTION(GENERATE_ABI_DESCRIPTOR "Generate XML file for abi_compliance_checker tool" OFF IF UNIX)
+OCV_OPTION(OPENCV_GENERATE_PKGCONFIG "Generate .pc file for pkg-config build tool (deprecated)" ON IF (UNIX AND NOT MSVC AND NOT IOS AND NOT ANDROID) )
 OCV_OPTION(CV_ENABLE_INTRINSICS "Use intrinsic-based optimized code" ON )
 OCV_OPTION(CV_DISABLE_OPTIMIZATION "Disable explicit optimized code (dispatched code/intrinsics/loop unrolling/etc)" OFF )
 OCV_OPTION(CV_TRACE "Enable OpenCV code trace" ON)
@@ -856,6 +858,7 @@ include(cmake/OpenCVGenHeaders.cmake)
 
 # Generate opencv.pc for pkg-config command
 if(NOT OPENCV_SKIP_PKGCONFIG_GENERATION
+    AND OPENCV_GENERATE_PKGCONFIG
     AND NOT CMAKE_GENERATOR MATCHES "Xcode")
   include(cmake/OpenCVGenPkgconfig.cmake)
 endif()
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index b622cd554f..1a0e292e57 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -605,10 +605,12 @@ macro(OCV_OPTION variable description value)
       option(${variable} "${description}" ${__value})
     endif()
   else()
-    if(DEFINED ${variable})
-      # TODO: message(WARNING "Option will be ignored: ${variable} (=${${variable}})")
+    if(DEFINED ${variable} AND NOT OPENCV_HIDE_WARNING_UNSUPPORTED_OPTION)
+      message(WARNING "Unexpected option: ${variable} (=${${variable}})\nCondition: IF (${__condition})")
+    endif()
+    if(OPENCV_UNSET_UNSUPPORTED_OPTION)
+      unset(${variable} CACHE)
     endif()
-    unset(${variable} CACHE)
   endif()
   unset(__condition)
   unset(__value)
diff --git a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
index 58273241d2..f9d70938eb 100644
--- a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
+++ b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
@@ -81,8 +81,8 @@ points.
 Now an orientation is assigned to each keypoint to achieve invariance to image rotation. A
 neighbourhood is taken around the keypoint location depending on the scale, and the gradient
 magnitude and direction is calculated in that region. An orientation histogram with 36 bins covering
-360 degrees is created. (It is weighted by gradient magnitude and gaussian-weighted circular window
-with \f$\sigma\f$ equal to 1.5 times the scale of keypoint. The highest peak in the histogram is taken
+360 degrees is created (it is weighted by gradient magnitude and gaussian-weighted circular window
+with \f$\sigma\f$ equal to 1.5 times the scale of keypoint). The highest peak in the histogram is taken
 and any peak above 80% of it is also considered to calculate the orientation. It creates keypoints
 with same location and scale, but different directions. It contribute to stability of matching.
 
@@ -99,7 +99,7 @@ illumination changes, rotation etc.
 
 Keypoints between two images are matched by identifying their nearest neighbours. But in some cases,
 the second closest-match may be very near to the first. It may happen due to noise or some other
 reasons. In that case, ratio of closest-distance to second-closest distance is taken. If it is
-greater than 0.8, they are rejected. It eliminaters around 90% of false matches while discards only
+greater than 0.8, they are rejected. It eliminates around 90% of false matches while discarding only
 5% correct matches, as per the paper.
 
 So this is a summary of SIFT algorithm. For more details and understanding, reading the original
diff --git a/doc/py_tutorials/py_video/py_bg_subtraction/py_bg_subtraction.markdown b/doc/py_tutorials/py_video/py_bg_subtraction/py_bg_subtraction.markdown
index 5f6cf9695a..4235e91639 100644
--- a/doc/py_tutorials/py_video/py_bg_subtraction/py_bg_subtraction.markdown
+++ b/doc/py_tutorials/py_video/py_bg_subtraction/py_bg_subtraction.markdown
@@ -20,7 +20,7 @@ extract the moving foreground from static background.
 If you have an image of background alone, like an image of the room without visitors, image of the
 road without vehicles etc, it is an easy job. Just subtract the new image from the background. You
 get the foreground objects alone. But in most of the cases, you may not have such an image, so we need
-to extract the background from whatever images we have. It become more complicated when there are
+to extract the background from whatever images we have. It becomes more complicated when there are
 shadows of the vehicles. Since shadows also move, simple subtraction will mark that also as
 foreground. It complicates things.
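The "easy job" case the tutorial text above describes reduces to a per-pixel absolute difference plus a threshold. A minimal C++ sketch of that naive approach (the function name and the cutoff value 30 are illustrative, not from the tutorial):

@code{.cpp}
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

// Naive background subtraction: absolute difference against a known
// static background, then a fixed threshold. Both inputs are assumed
// to be CV_8UC1 images of the same size.
cv::Mat naiveForegroundMask(const cv::Mat& frame, const cv::Mat& background)
{
    cv::Mat diff, fgmask;
    cv::absdiff(frame, background, diff);                     // |frame - background|
    cv::threshold(diff, fgmask, 30, 255, cv::THRESH_BINARY);  // 30: arbitrary cutoff
    return fgmask;
}
@endcode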
@@ -72,7 +72,7 @@ papers by Z.Zivkovic, "Improved adaptive Gaussian mixture model for background s
 and "Efficient Adaptive Density Estimation per Image Pixel for the Task of Background Subtraction"
 in 2006. One important feature of this algorithm is that it selects the appropriate number of
 gaussian distribution for each pixel. (Remember, in last case, we took a K gaussian distributions
-throughout the algorithm). It provides better adaptibility to varying scenes due illumination
+throughout the algorithm). It provides better adaptability to varying scenes due to illumination
 changes etc.
 
 As in previous case, we have to create a background subtractor object. Here, you have an option of
diff --git a/doc/py_tutorials/py_video/py_lucas_kanade/py_lucas_kanade.markdown b/doc/py_tutorials/py_video/py_lucas_kanade/py_lucas_kanade.markdown
index 0c63e35eb0..61abdd4889 100644
--- a/doc/py_tutorials/py_video/py_lucas_kanade/py_lucas_kanade.markdown
+++ b/doc/py_tutorials/py_video/py_lucas_kanade/py_lucas_kanade.markdown
@@ -75,10 +75,10 @@ solution.
 ( Check similarity of inverse matrix with Harris corner detector. It denotes that corners are better
 points to be tracked.)
 
-So from user point of view, idea is simple, we give some points to track, we receive the optical
+So from the user's point of view, the idea is simple: we give some points to track, we receive the optical
 flow vectors of those points. But again there are some problems. Until now, we were dealing with
-small motions. So it fails when there is large motion. So again we go for pyramids. When we go up in
-the pyramid, small motions are removed and large motions becomes small motions. So applying
+small motions, so it fails when there is a large motion. To deal with this, we use pyramids. When we go up in
+the pyramid, small motions are removed and large motions become small motions. So by applying
 Lucas-Kanade there, we get optical flow along with the scale.
 
 Lucas-Kanade Optical Flow in OpenCV
diff --git a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown
index c85503773b..11d4d861c1 100644
--- a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown
+++ b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.markdown
@@ -69,7 +69,7 @@ to an integer format. Then we use a simple look and the upper formula to calcula
 No OpenCV specific stuff here.
 
 Another issue is how do we measure time? Well OpenCV offers two simple functions to achieve this
-@ref cv::getTickCount() and @ref cv::getTickFrequency() . The first returns the number of ticks of
+cv::getTickCount() and cv::getTickFrequency(). The first returns the number of ticks of
 your systems CPU from a certain event (like since you booted your system). The second returns how
 many times your CPU emits a tick during a second. So to measure in seconds the number of time
 elapsed between two operations is easy as:
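For reference, a self-contained sketch of the tick-count timing pattern this paragraph describes (the work being timed is a placeholder):

@code{.cpp}
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    double t = (double)cv::getTickCount();
    // ... the operation to be measured goes here ...
    t = ((double)cv::getTickCount() - t) / cv::getTickFrequency();
    std::cout << "Times passed in seconds: " << t << std::endl;
    return 0;
}
@endcode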
@@ -98,7 +98,7 @@ example in case of an BGR color system:
 Note that the order of the channels is inverse: BGR instead of RGB. Because in many cases the memory
 is large enough to store the rows in a successive fashion the rows may follow one after another,
 creating a single long row. Because everything is in a single place following one after another this
-may help to speed up the scanning process. We can use the @ref cv::Mat::isContinuous() function to *ask*
+may help to speed up the scanning process. We can use the cv::Mat::isContinuous() function to *ask*
 the matrix if this is the case. Continue on to the next section to find an example.
 
 The efficient way
@@ -155,7 +155,7 @@ elements in the image. Its basic usage is to specify the row and column number o
 to access. During our earlier scanning methods you could already observe that is important through
 what type we are looking at the image. It's no different here as you need to manually specify what
 type to use at the automatic lookup. You can observe this in case of the gray scale images for the
-following source code (the usage of the + @ref cv::at() function):
+following source code (the usage of the cv::Mat::at() function):
 
 @snippet how_to_scan_images.cpp scan-random
 
@@ -169,12 +169,12 @@ new row pointer for what we use the C operator[] to acquire the column element.
 
 If you need to do multiple lookups using this method for an image it may be troublesome and time
 consuming to enter the type and the at keyword for each of the accesses. To solve this problem
-OpenCV has a @ref cv::Mat_ data type. It's the same as Mat with the extra need that at definition
+OpenCV has a cv::Mat_ data type. It's the same as Mat with the extra need that at definition
 you need to specify the data type through what to look at the data matrix, however in return you can
 use the operator() for fast access of items. To make things even better this is easily convertible
-from and to the usual @ref cv::Mat data type. A sample usage of this you can see in case of the
+from and to the usual cv::Mat data type. A sample usage of this you can see in case of the
 color images of the upper function. Nevertheless, it's important to note that the same operation
-(with the same runtime speed) could have been done with the @ref cv::at() function. It's just a less
+(with the same runtime speed) could have been done with the cv::Mat::at function. It's just less
 to write for the lazy programmer trick.
 
 The Core Function
@@ -183,7 +183,7 @@ The Core Function
 This is a bonus method of achieving lookup table modification in an image. In image processing it's
 quite common that you want to modify all of a given image values to some other value. OpenCV
 provides a function for modifying image values, without the need to write the scanning logic
-of the image. We use the @ref cv::LUT() function of the core module. First we build a Mat type of the
+of the image. We use the cv::LUT() function of the core module. First we build a Mat type of the
 lookup table:
 
 @snippet how_to_scan_images.cpp table-init
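A condensed sketch of the lookup-table flow that the referenced snippets build up (the divisor 10 matches the tutorial's color-space reduction example; the function name is illustrative):

@code{.cpp}
#include <opencv2/core.hpp>

// Build a 256-entry table once, then let cv::LUT() apply it to every
// element of the image without any explicit scanning logic.
cv::Mat reduceColorSpace(const cv::Mat& img)
{
    uchar table[256];
    for (int i = 0; i < 256; ++i)
        table[i] = (uchar)(i / 10 * 10);  // quantize each 8-bit value
    cv::Mat lut(1, 256, CV_8U, table);
    cv::Mat result;
    cv::LUT(img, lut, result);            // result pixel = table[input pixel]
    return result;
}
@endcode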
diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index c98a428df0..34f564df0b 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -2228,13 +2228,13 @@ bool findCirclesGrid( InputArray _image, Size patternSize,
     void* oldCbkData;
     ErrorCallback oldCbk = redirectError(quiet_error, 0, &oldCbkData); // FIXIT not thread safe
 #endif
-    CV_TRY
+    try
     {
         isFound = boxFinder.findHoles();
     }
-    CV_CATCH(Exception, e)
+    catch (const cv::Exception &)
     {
-        CV_UNUSED(e);
+
     }
 #if BE_QUIET
     redirectError(oldCbk, oldCbkData);
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 1306c3d9dc..332595897d 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -2,6 +2,7 @@ set(the_description "The Core Functionality")
 
 ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_dispatched_file(stat SSE4_2 AVX2)
+ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2)
 
 # dispatching for accuracy tests
 ocv_add_dispatched_file_force_all(test_intrin128 TEST SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX FP16 AVX2)
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 6e31b12692..7341598af1 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -152,20 +152,6 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
 
 #define CV_UNUSED(name) (void)name
 
-#if defined __GNUC__ && !defined __EXCEPTIONS
-#define CV_TRY
-#define CV_CATCH(A, B) for (A B; false; )
-#define CV_CATCH_ALL if (false)
-#define CV_THROW(A) abort()
-#define CV_RETHROW() abort()
-#else
-#define CV_TRY try
-#define CV_CATCH(A, B) catch(const A & B)
-#define CV_CATCH_ALL catch(...)
-#define CV_THROW(A) throw A
-#define CV_RETHROW() throw
-#endif
-
 //!
@endcond // undef problematic defines sometimes defined by system headers (windows.h in particular) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 54f1c48109..58b3e7fae7 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -661,7 +661,7 @@ inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); - return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); + return v_pack(c, d); } inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) { @@ -1291,6 +1291,16 @@ inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) { return v_abs(a - b); } +/** Saturating absolute difference **/ +inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) +{ + v_int8x32 d = a - b; + v_int8x32 m = a < b; + return (d ^ m) - m; +} +inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) +{ return v_max(a, b) - v_min(a, b); } + ////////// Conversions ///////// /** Rounding **/ @@ -1300,6 +1310,12 @@ inline v_int32x8 v_round(const v_float32x8& a) inline v_int32x8 v_round(const v_float64x4& a) { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); } +inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b) +{ + __m128i ai = _mm256_cvtpd_epi32(a.val), bi = _mm256_cvtpd_epi32(b.val); + return v_int32x8(_v256_combine(ai, bi)); +} + inline v_int32x8 v_trunc(const v_float32x8& a) { return v_int32x8(_mm256_cvttps_epi32(a.val)); } @@ -1689,6 +1705,40 @@ void v_rshr_pack_store(int* ptr, const v_int64x4& a) v_pack_store(ptr, (a + delta) >> n); } +// pack boolean +inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i ab = _mm256_packs_epi16(a.val, b.val); + return v_uint8x32(_v256_shuffle_odd_64(ab)); +} + +inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b, + const v_uint32x8& c, const v_uint32x8& d) +{ + __m256i ab = _mm256_packs_epi32(a.val, b.val); + __m256i cd = _mm256_packs_epi32(c.val, d.val); + + __m256i abcd = _v256_shuffle_odd_64(_mm256_packs_epi16(ab, cd)); + return v_uint8x32(_mm256_shuffle_epi32(abcd, _MM_SHUFFLE(3, 1, 2, 0))); +} + +inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, + const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f, + const v_uint64x4& g, const v_uint64x4& h) +{ + __m256i ab = _mm256_packs_epi32(a.val, b.val); + __m256i cd = _mm256_packs_epi32(c.val, d.val); + __m256i ef = _mm256_packs_epi32(e.val, f.val); + __m256i gh = _mm256_packs_epi32(g.val, h.val); + + __m256i abcd = _mm256_packs_epi32(ab, cd); + __m256i efgh = _mm256_packs_epi32(ef, gh); + __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh)); + + __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8); + return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev)); +} + /* Recombine */ // its up there with load and store operations diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 38a39172d0..5712f167a8 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -109,7 +109,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto - Interleave, deinterleave (2, 3 and 4 channels): @ref 
v_load_deinterleave, @ref v_store_interleave
 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
-- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
+- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
   @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
 - Extract: @ref v_extract
@@ -159,7 +159,7 @@ Most of these operations return only one value.
 ### Other math
 
 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
-- Absolute values: @ref v_abs, @ref v_absdiff
+- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
 
 ### Conversions
@@ -199,10 +199,12 @@ Regular integers:
 |logical        | x | x | x | x | x | x |
 |min, max       | x | x | x | x | x | x |
 |absdiff        | x | x | x | x | x | x |
+|absdiffs       |   | x |   | x |   |   |
 |reduce         |   |   |   |   | x | x |
 |mask           | x | x | x | x | x | x |
 |pack           | x | x | x | x | x | x |
 |pack_u         | x |   | x |   |   |   |
+|pack_b         | x |   |   |   |   |   |
 |unpack         | x | x | x | x | x | x |
 |extract        | x | x | x | x | x | x |
 |rotate (lanes) | x | x | x | x | x | x |
@@ -762,6 +764,19 @@ inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
     return c;
 }
 
+/** @brief Saturating absolute difference
+
+Returns \f$ saturate(|a - b|) \f$ .
+For 8-, 16-bit signed integer source types. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++)
+        c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
+    return c;
+}
+
 /** @brief Inversed square root
 
 Returns \f$ 1/sqrt(a) \f$
@@ -1613,6 +1628,18 @@ template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
     return c;
 }
 
+/** @overload */
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = cvRound(a.s[i]);
+        c.s[i+n] = cvRound(b.s[i]);
+    }
+    return c;
+}
+
 /** @brief Floor
 
 Floor each value. Input type is float vector ==> output type is int vector.*/
@@ -2059,6 +2086,103 @@ OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast<uchar>)
 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast<ushort>)
 //! @}
 
+//! @cond IGNORED
+template<typename _Tpm, typename _Tp, int n>
+inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    for (int i = 0; i < n; ++i)
+    {
+        mptr[i] = (_Tpm)a.s[i];
+        mptr[i + n] = (_Tpm)b.s[i];
+    }
+}
+//! @endcond
+
+//! @name Pack boolean values
+//! @{
+//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
+//!
+//! @note Must provide valid boolean values to guarantee same result for all architectures.
+
+/** @brief
+//! For 16-bit boolean values
+
+Scheme:
+@code
+a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
+b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
+===============
+{
+   0xFF 0 0 0xFF 0 0xFF 0xFF 0
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint8x16 mask;
+    _pack_b(mask.s, a, b);
+    return mask;
+}
+
+/** @overload
+For 32-bit boolean values
+
+Scheme:
+@code
+a {0xFFFF.. 0 0 0xFFFF..}
+b {0 0xFFFF.. 0xFFFF.. 0}
+c {0xFFFF.. 0 0xFFFF.. 0}
+d {0 0xFFFF..
0 0xFFFF..} +=============== +{ + 0xFF 0 0 0xFF 0 0xFF 0xFF 0 + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 8, c, d); + return mask; +} + +/** @overload +For 64-bit boolean values + +Scheme: +@code +a {0xFFFF.. 0} +b {0 0xFFFF..} +c {0xFFFF.. 0} +d {0 0xFFFF..} + +e {0xFFFF.. 0} +f {0xFFFF.. 0} +g {0 0xFFFF..} +h {0 0xFFFF..} +=============== +{ + 0xFF 0 0 0xFF 0xFF 0 0 0xFF + 0xFF 0 0xFF 0 0 0xFF 0 0xFF +} +@endcode */ +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + v_uint8x16 mask; + _pack_b(mask.s, a, b); + _pack_b(mask.s + 4, c, d); + _pack_b(mask.s + 8, e, f); + _pack_b(mask.s + 12, g, h); + return mask; +} +//! @} + /** @brief Matrix multiplication Scheme: diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 8c13ad52db..50c9b154ee 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -394,6 +394,35 @@ OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16) OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32) +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + uint8x16_t ab = vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val)); + return v_uint8x16(ab); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + uint16x8_t nab = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val)); + uint16x8_t ncd = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val)); + return v_uint8x16(vcombine_u8(vmovn_u16(nab), vmovn_u16(ncd))); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + uint32x4_t ab = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val)); + uint32x4_t cd = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val)); + uint32x4_t ef = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val)); + uint32x4_t gh = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val)); + + uint16x8_t abcd = vcombine_u16(vmovn_u32(ab), vmovn_u32(cd)); + uint16x8_t efgh = vcombine_u16(vmovn_u32(ef), vmovn_u32(gh)); + return v_uint8x16(vcombine_u8(vmovn_u16(abcd), vmovn_u16(efgh))); +} + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -748,7 +777,6 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16) -// TODO: absdiff for signed integers OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) @@ -757,6 +785,12 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64) #endif +/** 
Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ return v_int8x16(vqabsq_s8(vqsubq_s8(a.val, b.val))); } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(vqabsq_s16(vqsubq_s16(a.val, b.val))); } + #define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1242,6 +1276,11 @@ inline v_int32x4 v_round(const v_float64x2& a) return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val)))); +} + inline v_int32x4 v_floor(const v_float64x2& a) { static const int32x2_t zero = vdup_n_s32(0); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index d4740b72fe..c49d0de377 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -634,6 +634,35 @@ void v_rshr_pack_store(int* ptr, const v_int64x2& a) _mm_storel_epi64((__m128i*)ptr, a2); } +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i ab = _mm_packs_epi16(a.val, b.val); + return v_uint8x16(ab); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + __m128i ab = _mm_packs_epi32(a.val, b.val); + __m128i cd = _mm_packs_epi32(c.val, d.val); + return v_uint8x16(_mm_packs_epi16(ab, cd)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + __m128i ab = _mm_packs_epi32(a.val, b.val); + __m128i cd = _mm_packs_epi32(c.val, d.val); + __m128i ef = _mm_packs_epi32(e.val, f.val); + __m128i gh = _mm_packs_epi32(g.val, h.val); + + __m128i abcd = _mm_packs_epi32(ab, cd); + __m128i efgh = _mm_packs_epi32(ef, gh); + return v_uint8x16(_mm_packs_epi16(abcd, efgh)); +} + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) @@ -706,19 +735,11 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ { a = a * b; return a; } +OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8, v_int32x4) -inline v_uint8x16 operator * (const v_uint8x16& a, const v_uint8x16& b) -{ - v_uint16x8 c, d; - v_mul_expand(a, b, c, d); - return v_pack_u(v_reinterpret_as_s16(c), v_reinterpret_as_s16(d)); -} -inline v_uint8x16& operator *= (v_uint8x16& a, const v_uint8x16& b) -{ a = a * b; return a; } - // Multiply and expand inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, v_uint16x8& c, v_uint16x8& d) @@ -1045,34 +1066,43 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); } -#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ -inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ -{ \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ -} \ 
-inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ -{ \ - __m128i smask = _mm_set1_epi32(smask32); \ - __m128i a1 = _mm_xor_si128(a.val, smask); \ - __m128i b1 = _mm_xor_si128(b.val, smask); \ - return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ -} - -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) +/** Absolute difference **/ +inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) +{ return v_add_wrap(a - b, b - a); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { - return v_max(a, b) - v_min(a, b); + v_int8x16 d = v_sub_wrap(a, b); + v_int8x16 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ + return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } - inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - __m128i d = _mm_sub_epi32(a.val, b.val); - __m128i m = _mm_cmpgt_epi32(b.val, a.val); - return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); + v_int32x4 d = a - b; + v_int32x4 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); } +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = a - b; + v_int8x16 m = a < b; + return (d ^ m) - m; + } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_max(a, b) - v_min(a, b); } + + inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { return a * b + c; @@ -1623,6 +1653,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a) inline v_int32x4 v_round(const v_float64x2& a) { return v_int32x4(_mm_cvtpd_epi32(a.val)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + __m128i ai = _mm_cvtpd_epi32(a.val), bi = _mm_cvtpd_epi32(b.val); + return v_int32x4(_mm_unpacklo_epi64(ai, bi)); +} + inline v_int32x4 v_floor(const v_float64x2& a) { __m128i a1 = _mm_cvtpd_epi32(a.val); diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 27efd2ad9c..b23e19950e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -383,6 +383,35 @@ OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int, //OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long, // vec_sra, vec_packsu, vec_add, pack_u) +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + vec_uchar16 ab = vec_pack(a.val, b.val); + return v_uint8x16(ab); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + vec_ushort8 ab = vec_pack(a.val, b.val); + vec_ushort8 cd = vec_pack(c.val, d.val); + return v_uint8x16(vec_pack(ab, cd)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + vec_uint4 ab = vec_pack(a.val, b.val); + vec_uint4 cd = vec_pack(c.val, d.val); + vec_uint4 ef = 
vec_pack(e.val, f.val); + vec_uint4 gh = vec_pack(g.val, h.val); + + vec_ushort8 abcd = vec_pack(ab, cd); + vec_ushort8 efgh = vec_pack(ef, gh); + return v_uint8x16(vec_pack(abcd, efgh)); +} + /* Recombine */ template inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) @@ -834,16 +863,27 @@ inline v_float32x4 v_abs(const v_float32x4& x) inline v_float64x2 v_abs(const v_float64x2& x) { return v_float64x2(vec_abs(x.val)); } +/** Absolute difference **/ +// unsigned OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd) -#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ -inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec2(cast(intrin(a.val, b.val))); } +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) +{ return v_reinterpret_as_u8(v_sub_wrap(v_max(a, b), v_min(a, b))); } +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); } -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd) -OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd) +inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) +{ return v_abs(a - b); } +inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) +{ return v_abs(a - b); } + +/** Absolute difference for signed integers **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ return v_int8x16(vec_abss(vec_subs(a.val, b.val))); } +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(vec_abss(vec_subs(a.val, b.val))); } ////////// Conversions ///////// @@ -854,6 +894,9 @@ inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_round(const v_float64x2& a) { return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_int4_z)); } +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ return v_int32x4(vec_mergesqo(vec_ctso(vec_rint(a.val)), vec_ctso(vec_rint(b.val)))); } + inline v_int32x4 v_floor(const v_float32x4& a) { return v_int32x4(vec_cts(vec_floor(a.val))); } diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index bb352b730f..73096aed31 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -180,6 +180,8 @@ T* allocSingleton(size_t count = 1) { return static_cast(allocSingletonBuffe * Structures and macros for integration with IPP * \****************************************************************************************/ +#define OPENCV_IPP_REDUCE_SIZE 1 + // Temporary disabled named IPP region. 
Accuracy #define IPP_DISABLE_PYRAMIDS_UP 1 // Different results #define IPP_DISABLE_PYRAMIDS_DOWN 1 // Different results diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index 5dd164d0ad..ae08a2609e 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -519,6 +519,23 @@ static inline size_t divUp(size_t a, unsigned int b) return (a + b - 1) / b; } +/** @brief Round first value up to the nearest multiple of second value. + +Use this function instead of `ceil((float)a / b) * b` expressions. + +@sa divUp +*/ +static inline int roundUp(int a, unsigned int b) +{ + CV_DbgAssert(a >= 0); + return a + b - 1 - (a + b -1) % b; +} +/** @overload */ +static inline size_t roundUp(size_t a, unsigned int b) +{ + return a + b - 1 - (a + b - 1) % b; +} + /** @brief Enables or disables the optimized code. The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2, diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 1560618c61..b25c7870ed 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -199,9 +199,9 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, func = tab[depth1]; Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); - Size sz = getContinuousSize(src1, src2, dst); + Size sz = getContinuousSize2D(src1, src2, dst); size_t len = sz.width*(size_t)cn; - if( len == (size_t)(int)len ) + if (len < INT_MAX) // FIXIT similar code below doesn't have that check { sz.width = (int)len; func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, 0); @@ -630,7 +630,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, usrdata, oclop, false)) Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); - Size sz = getContinuousSize(src1, src2, dst, src1.channels()); + Size sz = getContinuousSize2D(src1, src2, dst, src1.channels()); tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata); return; } @@ -1233,7 +1233,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) int cn = src1.channels(); _dst.create(src1.size(), CV_8UC(cn)); Mat dst = _dst.getMat(); - Size sz = getContinuousSize(src1, src2, dst, src1.channels()); + Size sz = getContinuousSize2D(src1, src2, dst, src1.channels()); getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, &op); return; } @@ -2088,1014 +2088,4 @@ cvMaxS( const void* srcarr1, double value, void* dstarr ) cv::max( src1, value, dst ); } - - -namespace cv { namespace hal { - -//======================================= - -#if (ARITHM_USE_IPP == 1) -static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step) -{ - if( height == 1 ) - step1 = step2 = step = width*elemSize; -} -#define CALL_IPP_BIN_E_12(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_E_21(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, 
(int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_12(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#define CALL_IPP_BIN_21(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } - -#else -#define CALL_IPP_BIN_E_12(fun) -#define CALL_IPP_BIN_E_21(fun) -#define CALL_IPP_BIN_12(fun) -#define CALL_IPP_BIN_21(fun) -#endif - - -//======================================= -// Add -//======================================= - -void add8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add8u, cv_hal_add8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_8u_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add8s, cv_hal_add8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -void add16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add16u, cv_hal_add16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_16u_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add16s, cv_hal_add16s, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_12(ippiAdd_16s_C1RSfs) - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add32s, cv_hal_add32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -void add32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add32f, cv_hal_add32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAdd_32f_C1R) - (vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void add64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(add64f, cv_hal_add64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Subtract -//======================================= - -void sub8u( const uchar* src1, size_t step1, - const 
uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub8u, cv_hal_sub8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_8u_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub8s, cv_hal_sub8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -void sub16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub16u, cv_hal_sub16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_16u_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub16s, cv_hal_sub16s, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_E_21(ippiSub_16s_C1RSfs) - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub32s, cv_hal_sub32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -void sub32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub32f, cv_hal_sub32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_21(ippiSub_32f_C1R) - (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void sub64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(sub64f, cv_hal_sub64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= - -#if (ARITHM_USE_IPP == 1) -#define CALL_IPP_MIN_MAX(fun, type) \ - CV_IPP_CHECK() \ - { \ - type* s1 = (type*)src1; \ - type* s2 = (type*)src2; \ - type* d = dst; \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - int i = 0; \ - for(; i < height; i++) \ - { \ - if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \ - break; \ - s1 = (type*)((uchar*)s1 + step1); \ - s2 = (type*)((uchar*)s2 + step2); \ - d = (type*)((uchar*)d + step); \ - } \ - if (i == height) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } -#else -#define CALL_IPP_MIN_MAX(fun, type) -#endif - -//======================================= -// Max -//======================================= - -void max8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max8u, cv_hal_max8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - 
CALL_HAL(max8s, cv_hal_max8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max16u, cv_hal_max16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max16s, cv_hal_max16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max32s, cv_hal_max32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max32f, cv_hal_max32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float) - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -void max64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(max64f, cv_hal_max64f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double) - vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Min -//======================================= - -void min8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min8u, cv_hal_min8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min8s, cv_hal_min8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min16u, cv_hal_min16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min16s, cv_hal_min16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min32s, cv_hal_min32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VMin)>(src1, 
step1, src2, step2, dst, step, width, height); -} - -void min32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min32f, cv_hal_min32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_32f, float) - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -void min64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(min64f, cv_hal_min64f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_MIN_MAX(ippsMinEvery_64f, double) - vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// AbsDiff -//======================================= - -void absdiff8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff8u, cv_hal_absdiff8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R) - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff8s, cv_hal_absdiff8s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff16u, cv_hal_absdiff16u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R) - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff16s, cv_hal_absdiff16s, src1, step1, src2, step2, dst, step, width, height) - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff32s, cv_hal_absdiff32s, src1, step1, src2, step2, dst, step, width, height) - vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -void absdiff32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff32f, cv_hal_absdiff32f, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R) - (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void absdiff64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(absdiff64f, cv_hal_absdiff64f, src1, step1, src2, step2, dst, step, width, height) - vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); -} - -//======================================= -// Logical -//======================================= - -#if (ARITHM_USE_IPP == 1) -#define CALL_IPP_UN(fun) \ - CV_IPP_CHECK() \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); CV_UNUSED(src2); \ - 
if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } -#else -#define CALL_IPP_UN(fun) -#endif - -void and8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(and8u, cv_hal_and8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiAnd_8u_C1R) - (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void or8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(or8u, cv_hal_or8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiOr_8u_C1R) - (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void xor8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(xor8u, cv_hal_xor8u, src1, step1, src2, step2, dst, step, width, height) - CALL_IPP_BIN_12(ippiXor_8u_C1R) - (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, width, height)); -} - -void not8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* ) -{ - CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height) - CALL_IPP_UN(ippiNot_8u_C1R) - (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, width, height)); -} - -//======================================= - -#if ARITHM_USE_IPP -inline static IppCmpOp convert_cmp(int _cmpop) -{ - return _cmpop == CMP_EQ ? ippCmpEq : - _cmpop == CMP_GT ? ippCmpGreater : - _cmpop == CMP_GE ? ippCmpGreaterEq : - _cmpop == CMP_LT ? ippCmpLess : - _cmpop == CMP_LE ? ippCmpLessEq : - (IppCmpOp)-1; -} -#define CALL_IPP_CMP(fun) \ - CV_IPP_CHECK() \ - { \ - IppCmpOp op = convert_cmp(*(int *)_cmpop); \ - if( op >= 0 ) \ - { \ - fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ - if (0 <= CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } -#else -#define CALL_IPP_CMP(fun) -#endif - -//======================================= -// Compare -//======================================= - -void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp8u, cv_hal_cmp8u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_8u_C1R) - //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 
0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - - for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes ) - { - v_store(dst + x, (v_load(src1 + x) > v_load(src2 + x)) ^ mask); - } - } -#endif - - for( ; x < width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - - for( ; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes ) - { - v_store(dst+x, (v_load(src1+x) == v_load(src2+x)) ^ mask); - } - } -#endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp8s, cv_hal_cmp8s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp16u, cv_hal_cmp16u, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_16u_C1R) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp16s, cv_hal_cmp16s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_16s_C1R) - //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); - - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - const int dWidth = v_uint8x16::nlanes; - - for( ; x <= width - dWidth; x += dWidth ) - { - v_int16x8 in1 = v_load(src1 + x); - v_int16x8 in2 = v_load(src2 + x); - v_uint16x8 t1 = v_reinterpret_as_u16(in1 > in2); - - in1 = v_load(src1 + x + v_uint16x8::nlanes); - in2 = v_load(src2 + x + v_uint16x8::nlanes); - v_uint16x8 t2 = v_reinterpret_as_u16(in1 > in2); - - v_store(dst+x, (v_pack(t1, t2)) ^ mask); - } - } -#endif - for( ; x < width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 
0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_uint8x16 mask = v_setall_u8((uchar)m); - const int dWidth = v_uint8x16::nlanes; - - for( ; x <= width - dWidth; x += dWidth ) - { - v_int16x8 in1 = v_load(src1 + x); - v_int16x8 in2 = v_load(src2 + x); - v_uint16x8 t1 = v_reinterpret_as_u16(in1 == in2); - - in1 = v_load(src1 + x + 8); - in2 = v_load(src2 + x + 8); - v_uint16x8 t2 = v_reinterpret_as_u16(in1 == in2); - - v_store(dst+x, (v_pack(t1, t2)^ mask)); - } - } -#endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp32s, cv_hal_cmp32s, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp32f, cv_hal_cmp32f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - CALL_IPP_CMP(ippiCompare_32f_C1R) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* _cmpop) -{ - CALL_HAL(cmp64f, cv_hal_cmp64f, src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop) - cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); -} - -//======================================= - -#if defined HAVE_IPP -#define CALL_IPP_MUL(fun) \ - CV_IPP_CHECK() \ - { \ - if (std::fabs(fscale - 1) <= FLT_EPSILON) \ - { \ - if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } - -#define CALL_IPP_MUL_2(fun) \ - CV_IPP_CHECK() \ - { \ - if (std::fabs(fscale - 1) <= FLT_EPSILON) \ - { \ - if (CV_INSTRUMENT_FUN_IPP(fun, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)) >= 0) \ - { \ - CV_IMPL_ADD(CV_IMPL_IPP); \ - return; \ - } \ - setIppErrorStatus(); \ - } \ - } - -#else -#define CALL_IPP_MUL(fun) -#define CALL_IPP_MUL_2(fun) -#endif - -//======================================= -// Multilpy -//======================================= - -void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul8u, cv_hal_mul8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_8u_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul8s, cv_hal_mul8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale); -} - -void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul16u, cv_hal_mul16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - 
float fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_16u_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul16s, cv_hal_mul16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL(ippiMul_16s_C1RSfs) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul32s, cv_hal_mul32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul32f, cv_hal_mul32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - float fscale = (float)*(const double*)scale; - CALL_IPP_MUL_2(ippiMul_32f_C1R) - mul_(src1, step1, src2, step2, dst, step, width, height, fscale); -} - -void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(mul64f, cv_hal_mul64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Divide -//======================================= - -void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div8u, cv_hal_div8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - if( src1 ) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); - else - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div8s, cv_hal_div8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div16u, cv_hal_div16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div16s, cv_hal_div16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div32s, cv_hal_div32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); 
-} - -void div32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div32f, cv_hal_div32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -void div64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(div64f, cv_hal_div64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale) - div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Reciprocial -//======================================= - -void recip8u( const uchar*, size_t, const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip8u, cv_hal_recip8u, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip8s( const schar*, size_t, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip8s, cv_hal_recip8s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip16u( const ushort*, size_t, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip16u, cv_hal_recip16u, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip16s( const short*, size_t, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip16s, cv_hal_recip16s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip32s( const int*, size_t, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip32s, cv_hal_recip32s, src2, step2, dst, step, width, height, *(const double*)scale) - recip_i(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip32f( const float*, size_t, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip32f, cv_hal_recip32f, src2, step2, dst, step, width, height, *(const double*)scale) - recip_f(src2, step2, dst, step, width, height, *(const double*)scale); -} - -void recip64f( const double*, size_t, const double* src2, size_t step2, - double* dst, size_t step, int width, int height, void* scale) -{ - CALL_HAL(recip64f, cv_hal_recip64f, src2, step2, dst, step, width, height, *(const double*)scale) - recip_f(src2, step2, dst, step, width, height, *(const double*)scale); -} - -//======================================= -// Add weighted -//======================================= - -void -addWeighted8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, - void* scalars ) -{ - CALL_HAL(addWeighted8u, cv_hal_addWeighted8u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - const double* scalars_ = (const double*)scalars; - float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2]; - - for( ; height--; 
src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - -#if CV_SIMD128 - if( hasSIMD128() ) - { - v_float32x4 g = v_setall_f32(gamma); - v_float32x4 a = v_setall_f32(alpha); - v_float32x4 b = v_setall_f32(beta); - - for( ; x <= width - v_uint16x8::nlanes; x += v_uint16x8::nlanes ) - { - v_uint16x8 in1_16 = v_load_expand(src1 + x); - v_int32x4 in1_32_l, in1_32_h; - v_expand(v_reinterpret_as_s16(in1_16), in1_32_l, in1_32_h); - v_float32x4 in1_f_l = v_cvt_f32(in1_32_l); - v_float32x4 in1_f_h = v_cvt_f32(in1_32_h); - - v_uint16x8 in2_16 = v_load_expand(src2 + x); - v_int32x4 in2_32_l, in2_32_h; - v_expand(v_reinterpret_as_s16(in2_16), in2_32_l, in2_32_h); - v_float32x4 in2_f_l = v_cvt_f32(in2_32_l); - v_float32x4 in2_f_h = v_cvt_f32(in2_32_h); - - v_int32x4 out_l = v_round(in1_f_l * a + in2_f_l * b + g); - v_int32x4 out_h = v_round(in1_f_h * a + in2_f_h * b + g); - - v_int16x8 out_16 = v_pack(out_l, out_h); - v_pack_u_store(dst + x, out_16); - } - } -#endif - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - float t0, t1; - t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; - - dst[x] = saturate_cast(t0); - dst[x+1] = saturate_cast(t1); - - t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; - t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; - - dst[x+2] = saturate_cast(t0); - dst[x+3] = saturate_cast(t1); - } - #endif - - for( ; x < width; x++ ) - { - float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - dst[x] = saturate_cast(t0); - } - } -} - -void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted8s, cv_hal_addWeighted8s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted16u, cv_hal_addWeighted16u, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted16s, cv_hal_addWeighted16s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted32s, cv_hal_addWeighted32s, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, int width, int height, void* scalars ) -{ - CALL_HAL(addWeighted32f, cv_hal_addWeighted32f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars) - addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); -} - -void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, 
size_t step, int width, int height, void* scalars )
-{
-    CALL_HAL(addWeighted64f, cv_hal_addWeighted64f, src1, step1, src2, step2, dst, step, width, height, (const double*)scalars)
-    addWeighted_<double, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
-}
-
-}} // cv::hal::
-
 /* End of file. */
diff --git a/modules/core/src/arithm.dispatch.cpp b/modules/core/src/arithm.dispatch.cpp
new file mode 100644
index 0000000000..1cbceaee29
--- /dev/null
+++ b/modules/core/src/arithm.dispatch.cpp
@@ -0,0 +1,11 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "precomp.hpp"
+#include "arithm_ipp.hpp"
+#include "arithm.simd.hpp"
+#include "arithm.simd_declarations.hpp"
+
+#define ARITHM_DISPATCHING_ONLY
+#include "arithm.simd.hpp"
\ No newline at end of file
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
new file mode 100644
index 0000000000..b97842f7ca
--- /dev/null
+++ b/modules/core/src/arithm.simd.hpp
@@ -0,0 +1,1913 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "opencv2/core/hal/intrin.hpp"
+
+//=========================================
+// Declare & Define & Dispatch in one step
+//=========================================
+
+// ARITHM_DISPATCHING_ONLY is defined by the arithm dispatch file
+
+#undef ARITHM_DECLARATIONS_ONLY
+#ifdef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+    #define ARITHM_DECLARATIONS_ONLY
+#endif
+
+#undef ARITHM_DEFINITIONS_ONLY
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && !defined(ARITHM_DISPATCHING_ONLY)
+    #define ARITHM_DEFINITIONS_ONLY
+#endif
+
+#ifdef ARITHM_DECLARATIONS_ONLY
+    #undef DEFINE_SIMD
+    #define DEFINE_SIMD(fun_name, c_type, ...) \
+        DECLARE_SIMD_FUN(fun_name, c_type)
+#endif // ARITHM_DECLARATIONS_ONLY
+
+#ifdef ARITHM_DEFINITIONS_ONLY
+    #undef DEFINE_SIMD
+    #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \
+        DECLARE_SIMD_FUN(fun_name, c_type) \
+        DEFINE_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__)
+#endif // ARITHM_DEFINITIONS_ONLY
+
+#ifdef ARITHM_DISPATCHING_ONLY
+    #undef DEFINE_SIMD
+    #define DEFINE_SIMD(fun_name, c_type, v_type, ...) \
+        DISPATCH_SIMD_FUN(fun_name, c_type, v_type, __VA_ARGS__)
+#endif // ARITHM_DISPATCHING_ONLY
+
+// workaround for when NEON lacks double-precision support
+#undef DEFINE_NOSIMD
+#ifdef ARITHM_DEFINITIONS_ONLY
+    #define DEFINE_NOSIMD(fun_name, c_type, ...) \
+        DECLARE_SIMD_FUN(fun_name, c_type) \
+        DEFINE_NOSIMD_FUN(fun_name, c_type, __VA_ARGS__)
+#else
+    #define DEFINE_NOSIMD DEFINE_SIMD
+#endif // ARITHM_DEFINITIONS_ONLY
+
+#ifndef SIMD_GUARD
+
+#define DEFINE_SIMD_U8(fun, ...) \
+    DEFINE_SIMD(__CV_CAT(fun, 8u), uchar, v_uint8, __VA_ARGS__)
+
+#define DEFINE_SIMD_S8(fun, ...) \
+    DEFINE_SIMD(__CV_CAT(fun, 8s), schar, v_int8, __VA_ARGS__)
+
+#define DEFINE_SIMD_U16(fun, ...) \
+    DEFINE_SIMD(__CV_CAT(fun, 16u), ushort, v_uint16, __VA_ARGS__)
+
+#define DEFINE_SIMD_S16(fun, ...) \
+    DEFINE_SIMD(__CV_CAT(fun, 16s), short, v_int16, __VA_ARGS__)
+
+#define DEFINE_SIMD_S32(fun, ...) \
+    DEFINE_SIMD(__CV_CAT(fun, 32s), int, v_int32, __VA_ARGS__)
+
+#define DEFINE_SIMD_F32(fun, ...) \
+    DEFINE_SIMD(__CV_CAT(fun, 32f), float, v_float32, __VA_ARGS__)
+
+#if CV_SIMD_64F
+    #define DEFINE_SIMD_F64(fun, ...)
\ + DEFINE_SIMD(__CV_CAT(fun, 64f), double, v_float64, __VA_ARGS__) +#else + #define DEFINE_SIMD_F64(fun, ...) \ + DEFINE_NOSIMD(__CV_CAT(fun, 64f), double, __VA_ARGS__) +#endif + +#define DEFINE_SIMD_SAT(fun, ...) \ + DEFINE_SIMD_U8(fun, __VA_ARGS__) \ + DEFINE_SIMD_S8(fun, __VA_ARGS__) \ + DEFINE_SIMD_U16(fun, __VA_ARGS__) \ + DEFINE_SIMD_S16(fun, __VA_ARGS__) + +#define DEFINE_SIMD_NSAT(fun, ...) \ + DEFINE_SIMD_S32(fun, __VA_ARGS__) \ + DEFINE_SIMD_F32(fun, __VA_ARGS__) \ + DEFINE_SIMD_F64(fun, __VA_ARGS__) + +#define DEFINE_SIMD_ALL(fun, ...) \ + DEFINE_SIMD_SAT(fun, __VA_ARGS__) \ + DEFINE_SIMD_NSAT(fun, __VA_ARGS__) + +#endif // SIMD_GUARD + +/////////////////////////////////////////////////////////////////////////// + +namespace cv { namespace hal { + +#ifndef ARITHM_DISPATCHING_ONLY + CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +#endif + +#ifdef ARITHM_DEFINITIONS_ONLY + +#if !CV_SIMD_64F +typedef int v_float64; // dummy +#endif + +//======================================= +// Utility +//======================================= + +/** add **/ +template +static inline T c_add(T a, T b) +{ return saturate_cast(a + b); } +template<> +inline uchar c_add(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a + b); } +// scale +template +static inline T1 c_add(T1 a, T1 b, T2 scalar) +{ return saturate_cast((T2)a * scalar + b); } +template<> +inline uchar c_add(uchar a, uchar b, float scalar) +{ return saturate_cast(CV_8TO32F(a) * scalar + b); } +// weight +template +static inline T1 c_add(T1 a, T1 b, T2 alpha, T2 beta, T2 gamma) +{ return saturate_cast(a * alpha + b * beta + gamma); } +template<> +inline uchar c_add(uchar a, uchar b, float alpha, float beta, float gamma) +{ return saturate_cast(CV_8TO32F(a) * alpha + CV_8TO32F(b) * beta + gamma); } + +/** sub **/ +template +static inline T c_sub(T a, T b) +{ return saturate_cast(a - b); } +template<> +inline uchar c_sub(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a - b); } + +/** max **/ +template +static inline T c_max(T a, T b) +{ return std::max(a, b); } +template<> +inline uchar c_max(uchar a, uchar b) +{ return CV_MAX_8U(a, b); } + +/** min **/ +template +static inline T c_min(T a, T b) +{ return std::min(a, b); } +template<> +inline uchar c_min(uchar a, uchar b) +{ return CV_MIN_8U(a, b); } + +/** absdiff **/ +template +static inline T c_absdiff(T a, T b) +{ return a > b ? 
a - b : b - a; } +template<> +inline schar c_absdiff(schar a, schar b) +{ return saturate_cast(std::abs(a - b)); } +template<> +inline short c_absdiff(short a, short b) +{ return saturate_cast(std::abs(a - b)); } +// specializations to prevent "-0" results +template<> +inline float c_absdiff(float a, float b) +{ return std::abs(a - b); } +template<> +inline double c_absdiff(double a, double b) +{ return std::abs(a - b); } + +/** multiply **/ +template +static inline T c_mul(T a, T b) +{ return saturate_cast(a * b); } +template<> +inline uchar c_mul(uchar a, uchar b) +{ return CV_FAST_CAST_8U(a * b); } +// scale +template +static inline T1 c_mul(T1 a, T1 b, T2 scalar) +{ return saturate_cast(scalar * (T2)a * b); } +template<> +inline uchar c_mul(uchar a, uchar b, float scalar) +{ return saturate_cast(scalar * CV_8TO32F(a) * CV_8TO32F(b)); } + +/** divide & reciprocal **/ +template +static inline T2 c_div(T1 a, T2 b) +{ return saturate_cast(a / b); } +// recip +template<> +inline uchar c_div(float a, uchar b) +{ return saturate_cast(a / CV_8TO32F(b)); } +// scale +template +static inline T1 c_div(T1 a, T1 b, T2 scalar) +{ return saturate_cast(scalar * (T2)a / b); } +template<> +inline uchar c_div(uchar a, uchar b, float scalar) +{ return saturate_cast(scalar * CV_8TO32F(a) / CV_8TO32F(b)); } + +//======================================= +// Arithmetic and logical operations +// +, -, *, /, &, |, ^, ~, abs ... +//======================================= + +///////////////////////////// Operations ////////////////////////////////// + +// Add +template +struct op_add +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a + b; } + static inline T1 r(T1 a, T1 b) + { return c_add(a, b); } +}; + +// Subtract +template +struct op_sub +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a - b; } + static inline T1 r(T1 a, T1 b) + { return c_sub(a, b); } +}; + +// Max & Min +template +struct op_max +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_max(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_max(a, b); } +}; + +template +struct op_min +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_min(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_min(a, b); } +}; + +// Absolute difference +template +struct op_absdiff +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return v_absdiff(a, b); } + static inline T1 r(T1 a, T1 b) + { return c_absdiff(a, b); } +}; +// Signed absolute difference, 's' +template<> +struct op_absdiff +{ + static inline v_int8 r(const v_int8& a, const v_int8& b) + { return v_absdiffs(a, b); } + static inline schar r(schar a, schar b) + { return c_absdiff(a, b); } +}; +template<> +struct op_absdiff +{ + static inline v_int16 r(const v_int16& a, const v_int16& b) + { return v_absdiffs(a, b); } + static inline short r(short a, short b) + { return c_absdiff(a, b); } +}; +template<> +struct op_absdiff +{ + static inline v_int32 r(const v_int32& a, const v_int32& b) + { return v_reinterpret_as_s32(v_absdiff(a, b)); } + static inline int r(int a, int b) + { return c_absdiff(a, b); } +}; + +// Logical +template +struct op_or +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a | b; } + static inline T1 r(T1 a, T1 b) + { return a | b; } +}; +template +struct op_xor +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a ^ b; } + static inline T1 r(T1 a, T1 b) + { return a ^ b; } +}; +template +struct op_and +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a & 
b; } + static inline T1 r(T1 a, T1 b) + { return a & b; } +}; +template +struct op_not +{ + // ignored b from loader level + static inline Tvec r(const Tvec& a) + { return ~a; } + static inline T1 r(T1 a, T1) + { return ~a; } +}; + +//////////////////////////// Loaders ///////////////////////////////// + +#if CV_SIMD + +template< template class OP, typename T1, typename Tvec> +struct bin_loader +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load(src1); + Tvec b = vx_load(src2); + v_store(dst, op::r(a, b)); + } + + static inline void la(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load_aligned(src1); + Tvec b = vx_load_aligned(src2); + v_store_aligned(dst, op::r(a, b)); // todo: try write without cache + } + + static inline void l64(const T1* src1, const T1* src2, T1* dst) + { + Tvec a = vx_load_low(src1), b = vx_load_low(src2); + v_store_low(dst, op::r(a, b)); + } +}; + +// void src2 for operation "not" +template +struct bin_loader +{ + typedef op_not op; + + static inline void l(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load(src1); + v_store(dst, op::r(a)); + } + + static inline void la(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load_aligned(src1); + v_store_aligned(dst, op::r(a)); + } + + static inline void l64(const T1* src1, const T1*, T1* dst) + { + Tvec a = vx_load_low(src1); + v_store_low(dst, op::r(a)); + } +}; + +#endif // CV_SIMD + +//////////////////////////// Loops ///////////////////////////////// + +template +static inline bool is_aligned(const T1* src1, const T1* src2, const T2* dst) +{ return (((size_t)src1|(size_t)src2|(size_t)dst) & (CV_SIMD_WIDTH - 1)) == 0; } + +template class OP, typename T1, typename Tvec> +static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height) +{ + typedef OP op; +#if CV_SIMD + typedef bin_loader ldr; + enum {wide_step = Tvec::nlanes}; + #if !CV_NEON && CV_SIMD_WIDTH == 16 + enum {wide_step_l = wide_step * 2}; + #else + enum {wide_step_l = wide_step}; + #endif +#endif // CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + #if !CV_NEON + if (is_aligned(src1, src2, dst)) + { + for (; x <= width - wide_step_l; x += wide_step_l) + { + ldr::la(src1 + x, src2 + x, dst + x); + #if !CV_NEON && CV_SIMD_WIDTH == 16 + ldr::la(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); + #endif + } + } + else + #endif + for (; x <= width - wide_step_l; x += wide_step_l) + { + ldr::l(src1 + x, src2 + x, dst + x); + #if !CV_NEON && CV_SIMD_WIDTH == 16 + ldr::l(src1 + x + wide_step, src2 + x + wide_step, dst + x + wide_step); + #endif + } + + #if CV_SIMD_WIDTH == 16 + for (; x <= width - 8/(int)sizeof(T1); x += 8/(int)sizeof(T1)) + { + ldr::l64(src1 + x, src2 + x, dst + x); + } + #endif + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x]); + T1 t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } + + vx_cleanup(); +} + +#if !CV_SIMD_64F +template class OP, typename T1, typename Tvec> +static void bin_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* 
dst, size_t step, int width, int height)
+{
+    typedef OP<T1, Tvec> op;
+
+    step1 /= sizeof(T1);
+    step2 /= sizeof(T1);
+    step /= sizeof(T1);
+
+    for (; height--; src1 += step1, src2 += step2, dst += step)
+    {
+        int x = 0;
+
+        for (; x <= width - 4; x += 4)
+        {
+            T1 t0 = op::r(src1[x], src2[x]);
+            T1 t1 = op::r(src1[x + 1], src2[x + 1]);
+            dst[x] = t0; dst[x + 1] = t1;
+
+            t0 = op::r(src1[x + 2], src2[x + 2]);
+            t1 = op::r(src1[x + 3], src2[x + 3]);
+            dst[x + 2] = t0; dst[x + 3] = t1;
+        }
+
+        for (; x < width; x++)
+            dst[x] = op::r(src1[x], src2[x]);
+    }
+}
+#define BIN_LOOP64F bin_loop_nosimd
+#else
+#define BIN_LOOP64F bin_loop
+#endif //!CV_SIMD_64F
+
+#endif // ARITHM_DEFINITIONS_ONLY
+
+////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef SIMD_GUARD
+#define BIN_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \
+                      _T1* dst, size_t step, int width, int height
+
+#define BIN_ARGS_PASS src1, step1, src2, step2, dst, step, width, height
+#endif // SIMD_GUARD
+
+#undef DECLARE_SIMD_FUN
+#define DECLARE_SIMD_FUN(fun, _T1) void fun(BIN_ARGS(_T1));
+
+#undef DISPATCH_SIMD_FUN
+#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, _OP) \
+    void fun(BIN_ARGS(_T1), void*) \
+    { \
+        CV_INSTRUMENT_REGION(); \
+        CALL_HAL(fun, __CV_CAT(cv_hal_, fun), BIN_ARGS_PASS) \
+        ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), BIN_ARGS_PASS) \
+        CV_CPU_DISPATCH(fun, (BIN_ARGS_PASS), CV_CPU_DISPATCH_MODES_ALL); \
+    }
+
+#undef DEFINE_SIMD_FUN
+#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, _OP) \
+    void fun(BIN_ARGS(_T1)) \
+    { \
+        CV_INSTRUMENT_REGION(); \
+        bin_loop<_OP, _T1, _Tvec>(BIN_ARGS_PASS); \
+    }
+
+#undef DEFINE_NOSIMD_FUN
+#define DEFINE_NOSIMD_FUN(fun, _T1, _OP) \
+    void fun(BIN_ARGS(_T1)) \
+    { \
+        CV_INSTRUMENT_REGION(); \
+        bin_loop_nosimd<_OP, _T1, v_float64>(BIN_ARGS_PASS); \
+    }
+
+DEFINE_SIMD_ALL(add, op_add)
+DEFINE_SIMD_ALL(sub, op_sub)
+
+DEFINE_SIMD_ALL(min, op_min)
+DEFINE_SIMD_ALL(max, op_max)
+
+DEFINE_SIMD_ALL(absdiff, op_absdiff)
+
+DEFINE_SIMD_U8(or, op_or)
+DEFINE_SIMD_U8(xor, op_xor)
+DEFINE_SIMD_U8(and, op_and)
+
+// One source!, an exception for operation "not".
+// We could use macros here, but it's better to implement it
+// this way, to make it clearer
+// how the "DEFINE_SIMD_*" macros work.
+
+#if defined(ARITHM_DECLARATIONS_ONLY) || defined(ARITHM_DEFINITIONS_ONLY)
+void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
+#endif
+#ifdef ARITHM_DEFINITIONS_ONLY
+void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
+{
+    CV_INSTRUMENT_REGION();
+    bin_loop<op_not, uchar, v_uint8>(src1, step1, src2, step2, dst, step, width, height);
+}
+#endif
+#ifdef ARITHM_DISPATCHING_ONLY
+void not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void*)
+{
+    CV_INSTRUMENT_REGION();
+    CALL_HAL(not8u, cv_hal_not8u, src1, step1, dst, step, width, height)
+    ARITHM_CALL_IPP(arithm_ipp_not8u, src1, step1, dst, step, width, height)
+    CV_CPU_DISPATCH(not8u, (src1, step1, src2, step2, dst, step, width, height), CV_CPU_DISPATCH_MODES_ALL);
+}
+#endif
+
+//=======================================
+// Compare
+//=======================================
+
+#ifdef ARITHM_DEFINITIONS_ONLY
+
+///////////////////////////// Operations //////////////////////////////////
+
+template<typename T1, typename Tvec>
+struct op_cmplt
+{
+    static inline Tvec r(const Tvec& a, const Tvec& b)
+    { return a < b; }
static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a < b); } +}; + +template +struct op_cmple +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a <= b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a <= b); } +}; + +template +struct op_cmpeq +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a == b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a == b); } +}; + +template +struct op_cmpne +{ + static inline Tvec r(const Tvec& a, const Tvec& b) + { return a != b; } + static inline uchar r(T1 a, T1 b) + { return (uchar)-(int)(a != b); } +}; + +//////////////////////////// Loaders ///////////////////////////////// + +#if CV_SIMD +// todo: add support for RW alignment & stream +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + void l(const T1* src1, const T1* src2, uchar* dst); +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + Tvec a = vx_load(src1); + Tvec b = vx_load(src2); + v_store(dst, v_reinterpret_as_u8(op::r(a, b))); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + Tvec c0 = op::r(vx_load(src1), vx_load(src2)); + Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step)); + v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1))); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2))); + v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step))); + v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); + v_uint32 c3 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); + v_store(dst, v_pack_b(c0, c1, c2, c3)); + } +}; + +template class OP, typename T1, typename Tvec> +struct cmp_loader_n +{ + typedef OP op; + enum {step = Tvec::nlanes}; + + static inline void l(const T1* src1, const T1* src2, uchar* dst) + { + v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2))); + v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step))); + v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2))); + v_uint64 c3 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 3), vx_load(src2 + step * 3))); + + v_uint64 c4 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 4), vx_load(src2 + step * 4))); + v_uint64 c5 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 5), vx_load(src2 + step * 5))); + v_uint64 c6 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 6), vx_load(src2 + step * 6))); + v_uint64 c7 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 7), vx_load(src2 + step * 7))); + v_store(dst, v_pack_b(c0, c1, c2, c3, c4, c5, c6, c7)); + } +}; + +#endif // CV_SIMD + +//////////////////////////// Loops ///////////////////////////////// + +template class OP, typename T1, typename Tvec> +static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + typedef OP op; +#if CV_SIMD + typedef cmp_loader_n ldr; + enum {wide_step = Tvec::nlanes * sizeof(T1)}; +#endif // 
CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, src2 + x, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + uchar t0 = op::r(src1[x], src2[x]); + uchar t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } + + vx_cleanup(); +} + +template +static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + switch(cmpop) + { + case CMP_LT: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GT: + cmp_loop(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_LE: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GE: + cmp_loop(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_EQ: + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + default: + CV_Assert(cmpop == CMP_NE); + cmp_loop(src1, step1, src2, step2, dst, step, width, height); + break; + } +} + +#if !CV_SIMD_64F +template< template class OP, typename T1> +static void cmp_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + uchar t0 = op::r(src1[x], src2[x]); + uchar t1 = op::r(src1[x + 1], src2[x + 1]); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2]); + t1 = op::r(src1[x + 3], src2[x + 3]); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; x++) + dst[x] = op::r(src1[x], src2[x]); + } +} +static void cmp_loop_nosimd(const double* src1, size_t step1, const double* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + switch(cmpop) + { + case CMP_LT: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GT: + cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_LE: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + case CMP_GE: + cmp_loop_nosimd(src2, step2, src1, step1, dst, step, width, height); + break; + case CMP_EQ: + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + default: + CV_Assert(cmpop == CMP_NE); + cmp_loop_nosimd(src1, step1, src2, step2, dst, step, width, height); + break; + } +} +#endif // !CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + +///////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef SIMD_GUARD +#define CMP_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \ + uchar* dst, size_t step, int width, int height + +#define CMP_ARGS_PASS src1, step1, src2, step2, dst, step, width, height +#endif // SIMD_GUARD + +#undef DECLARE_SIMD_FUN +#define DECLARE_SIMD_FUN(fun, _T1) void fun(CMP_ARGS(_T1), int cmpop); + +#undef DISPATCH_SIMD_FUN +#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...) 
\ + void fun(CMP_ARGS(_T1), void* _cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + CALL_HAL(fun, __CV_CAT(cv_hal_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ + ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun), CMP_ARGS_PASS, *(int*)_cmpop) \ + CV_CPU_DISPATCH(fun, (CMP_ARGS_PASS, *(int*)_cmpop), CV_CPU_DISPATCH_MODES_ALL); \ + } + +#undef DEFINE_SIMD_FUN +#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(CMP_ARGS(_T1), int cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + cmp_loop<_T1, _Tvec>(CMP_ARGS_PASS, cmpop); \ + } + +#undef DEFINE_NOSIMD_FUN +#define DEFINE_NOSIMD_FUN(fun, _T1, _Tvec, ...) \ + void fun(CMP_ARGS(_T1), int cmpop) \ + { \ + CV_INSTRUMENT_REGION(); \ + cmp_loop_nosimd(CMP_ARGS_PASS, cmpop); \ + } + +// todo: try to avoid define dispatcher functions using macros with these such cases +DEFINE_SIMD_ALL(cmp) + +//========================================================================= +// scaling helpers for single and dual source +// +// Dual: Multiply, Div, AddWeighted +// +// Single: Reciprocal +// +//========================================================================= + +#ifdef ARITHM_DEFINITIONS_ONLY + +//////////////////////////// Loaders /////////////////////////////// + +#if CV_SIMD +// todo: add support for RW alignment & stream +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst); + // single source + void l(const T1* src1, const T2* scalar, T1* dst); +}; + +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) + { + v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); + v_int16 v_src2 = v_reinterpret_as_s16(vx_load_expand(src2)); + + v_int32 t0, t1, t2, t3; + v_expand(v_src1, t0, t2); + v_expand(v_src2, t1, t3); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(t0); + f1 = v_cvt_f32(t1); + f2 = v_cvt_f32(t2); + f3 = v_cvt_f32(t3); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + store(dst, v_src2, r0, r1); + } + + static inline void l(const T1* src1, const T2* scalar, T1* dst) + { + v_int16 v_src1 = v_reinterpret_as_s16(vx_load_expand(src1)); + + v_int32 t0, t1; + v_expand(v_src1, t0, t1); + + v_float32 f0, f1; + f0 = v_cvt_f32(t0); + f1 = v_cvt_f32(t1); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + store(dst, v_src1, r0, r1); + } + + static inline void store(uchar* dst, const v_int16& src, const v_int32& a, const v_int32& b) + { + v_pack_u_store(dst, op::pre(src, v_pack(a, b))); + } + static inline void store(schar* dst, const v_int16& src, const v_int32& a, const v_int32& b) + { + v_pack_store(dst, op::pre(src, v_pack(a, b))); + } +}; + +template class OP, typename T1, typename T2, typename Tvec> +struct scalar_loader_n +{ + typedef typename V_RegTraits::w_reg Twvec; + typedef OP op; + + static inline void l(const T1* src1, const T1* src2, const T2* scalar, T1* dst) + { + Tvec v_src1 = vx_load(src1); + Tvec v_src2 = vx_load(src2); + + Twvec t0, t1, t2, t3; + v_expand(v_src1, t0, t2); + v_expand(v_src2, t1, t3); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 
r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + store(dst, v_src2, r0, r1); + } + + static inline void l(const T1* src1, const T2* scalar, T1* dst) + { + Tvec v_src1 = vx_load(src1); + + Twvec t0, t1; + v_expand(v_src1, t0, t1); + + v_float32 f0, f1; + f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + store(dst, v_src1, r0, r1); + } + + static inline void store(ushort* dst, const Tvec& src, const v_int32& a, const v_int32& b) + { + v_store(dst, op::pre(src, v_pack_u(a, b))); + } + static inline void store(short* dst, const Tvec& src, const v_int32& a, const v_int32& b) + { + v_store(dst, op::pre(src, v_pack(a, b))); + } +}; + +template class OP, typename T2> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_int32::nlanes}; + + static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src2 = vx_load(src2); + v_int32 v_src1s = vx_load(src1 + step); + v_int32 v_src2s = vx_load(src2 + step); + + v_float32 f0, f1, f2, f3; + f0 = v_cvt_f32(v_reinterpret_as_s32(v_src1)); + f1 = v_cvt_f32(v_reinterpret_as_s32(v_src2)); + f2 = v_cvt_f32(v_reinterpret_as_s32(v_src1s)); + f3 = v_cvt_f32(v_reinterpret_as_s32(v_src2s)); + + f0 = op::r(f0, f1, scalar); + f2 = op::r(f2, f3, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f2); + + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline void l(const int* src1, const T2* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src1s = vx_load(src1 + step); + + v_float32 f0, f1; + f0 = v_cvt_f32(v_src1); + f1 = v_cvt_f32(v_src1s); + + f0 = op::r(f0, scalar); + f1 = op::r(f1, scalar); + + v_int32 r0 = v_round(f0); + v_int32 r1 = v_round(f1); + + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; + +template class OP, typename T2> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_float32::nlanes}; + + static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src2 = vx_load(src2); + v_float32 v_src1s = vx_load(src1 + step); + v_float32 v_src2s = vx_load(src2 + step); + + v_float32 r0 = op::r(v_src1, v_src2, scalar); + v_float32 r1 = op::r(v_src1s, v_src2s, scalar); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline void l(const float* src1, const T2* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src1s = vx_load(src1 + step); + + v_float32 r0 = op::r(v_src1, scalar); + v_float32 r1 = op::r(v_src1s, scalar); + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; +#endif // CV_SIMD + +#if CV_SIMD_64F +template class OP> +struct scalar_loader_n +{ + typedef OP op; + typedef OP op64; + enum {step = v_int32::nlanes}; + + static inline void l(const int* src1, const int* src2, const double* scalar, int* dst) + { + v_int32 v_src1 = vx_load(src1); + v_int32 v_src2 = vx_load(src2); + v_int32 v_src1s = vx_load(src1 + step); + v_int32 v_src2s = vx_load(src2 + step); + + v_int32 r0 = r(v_src1, v_src2, scalar); + v_int32 r1 = r(v_src1s, v_src2s, scalar); + + r0 = op::pre(v_src2, r0); + r1 = op::pre(v_src2s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const int* src1, const double* scalar, int* dst) + { + 
v_int32 v_src1 = vx_load(src1); + v_int32 v_src1s = vx_load(src1 + step); + + v_int32 r0 = r(v_src1, scalar); + v_int32 r1 = r(v_src1s, scalar); + + r0 = op::pre(v_src1, r0); + r1 = op::pre(v_src1s, r1); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline v_int32 r(const v_int32& a, const v_int32& b, const double* scalar) + { + v_float64 f0, f1, f2, f3; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + f2 = v_cvt_f64(b); + f3 = v_cvt_f64_high(b); + + v_float64 r0 = op64::r(f0, f2, scalar); + v_float64 r1 = op64::r(f1, f3, scalar); + + return v_round(r0, r1); + } + static inline v_int32 r(const v_int32& a, const double* scalar) + { + v_float64 f0, f1; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + + v_float64 r0 = op64::r(f0, scalar); + v_float64 r1 = op64::r(f1, scalar); + + return v_round(r0, r1); + } +}; + +template class OP> +struct scalar_loader_n +{ + typedef OP op; + typedef OP op64; + enum {step = v_float32::nlanes}; + + static inline void l(const float* src1, const float* src2, const double* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src2 = vx_load(src2); + v_float32 v_src1s = vx_load(src1 + step); + v_float32 v_src2s = vx_load(src2 + step); + + v_float32 r0 = r(v_src1, v_src2, scalar); + v_float32 r1 = r(v_src1s, v_src2s, scalar); + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const float* src1, const double* scalar, float* dst) + { + v_float32 v_src1 = vx_load(src1); + v_float32 v_src1s = vx_load(src1 + step); + + v_float32 r0 = r(v_src1, scalar); + v_float32 r1 = r(v_src1s, scalar); + + v_store(dst, r0); + v_store(dst + step, r1); + } + + static inline v_float32 r(const v_float32& a, const v_float32& b, const double* scalar) + { + v_float64 f0, f1, f2, f3; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + f2 = v_cvt_f64(b); + f3 = v_cvt_f64_high(b); + + v_float64 r0 = op64::r(f0, f2, scalar); + v_float64 r1 = op64::r(f1, f3, scalar); + + return v_cvt_f32(r0, r1); + } + static inline v_float32 r(const v_float32& a, const double* scalar) + { + v_float64 f0, f1; + f0 = v_cvt_f64(a); + f1 = v_cvt_f64_high(a); + + v_float64 r0 = op64::r(f0, scalar); + v_float64 r1 = op64::r(f1, scalar); + + return v_cvt_f32(r0, r1); + } +}; + +template class OP> +struct scalar_loader_n +{ + typedef OP op; + enum {step = v_float64::nlanes}; + + static inline void l(const double* src1, const double* src2, const double* scalar, double* dst) + { + v_float64 v_src1 = vx_load(src1); + v_float64 v_src2 = vx_load(src2); + v_float64 v_src1s = vx_load(src1 + step); + v_float64 v_src2s = vx_load(src2 + step); + + v_float64 r0 = op::r(v_src1, v_src2, scalar); + v_float64 r1 = op::r(v_src1s, v_src2s, scalar); + + v_store(dst, r0); + v_store(dst + step, r1); + } + static inline void l(const double* src1, const double* scalar, double* dst) + { + v_float64 v_src1 = vx_load(src1); + v_float64 v_src1s = vx_load(src1 + step); + + v_float64 r0 = op::r(v_src1, scalar); + v_float64 r1 = op::r(v_src1s, scalar); + + v_store(dst, r0); + v_store(dst + step, r1); + } +}; +#endif // CV_SIMD_64F + +//////////////////////////// Loops ///////////////////////////////// + +// dual source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; +#if CV_SIMD + typedef scalar_loader_n ldr; + const int wide_step = sizeof(T1) > sizeof(ushort) ? 
Tvec::nlanes * 2 : + sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; +#endif // CV_SIMD + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, src2 + x, scalar, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x], scalar); + T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2], scalar); + t1 = op::r(src1[x + 3], src2[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; ++x) + dst[x] = op::r(src1[x], src2[x], scalar); + } + + vx_cleanup(); +} + +// single source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; +#if CV_SIMD + typedef scalar_loader_n ldr; + const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 : + sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes; +#endif // CV_SIMD + + step1 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, dst += step) + { + int x = 0; + + #if CV_SIMD + for (; x <= width - wide_step; x += wide_step) + { + ldr::l(src1 + x, scalar, dst + x); + } + #endif // CV_SIMD + + #if CV_ENABLE_UNROLLED || CV_SIMD_WIDTH > 16 + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], scalar); + T1 t1 = op::r(src1[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], scalar); + t1 = op::r(src1[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + #endif + + for (; x < width; ++x) + dst[x] = op::r(src1[x], scalar); + } + + vx_cleanup(); +} + +#if !CV_SIMD_64F +// dual source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop_nosimd(const T1* src1, size_t step1, const T1* src2, size_t step2, + T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; + + step1 /= sizeof(T1); + step2 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, src2 += step2, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], src2[x], scalar); + T1 t1 = op::r(src1[x + 1], src2[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], src2[x + 2], scalar); + t1 = op::r(src1[x + 3], src2[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; ++x) + dst[x] = op::r(src1[x], src2[x], scalar); + } +} + +// single source +template class OP, typename T1, typename T2, typename Tvec> +static void scalar_loop_nosimd(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar) +{ + typedef OP op; + + step1 /= sizeof(T1); + step /= sizeof(T1); + + for (; height--; src1 += step1, dst += step) + { + int x = 0; + + for (; x <= width - 4; x += 4) + { + T1 t0 = op::r(src1[x], scalar); + T1 t1 = op::r(src1[x + 1], scalar); + dst[x] = t0; dst[x + 1] = t1; + + t0 = op::r(src1[x + 2], scalar); + t1 = op::r(src1[x + 3], scalar); + dst[x + 2] = t0; dst[x + 3] = t1; + } + + for (; x < width; ++x) + dst[x] = op::r(src1[x], scalar); + } +} + +#define SCALAR_LOOP64F scalar_loop_nosimd +#else +#define SCALAR_LOOP64F scalar_loop +#endif // !CV_SIMD_64F + +#endif // ARITHM_DEFINITIONS_ONLY + 
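The declare/define/dispatch machinery at the top of this header is what makes the single DEFINE_SIMD_ALL lines work: arithm.simd.hpp is included several times, and the ARITHM_*_ONLY guards decide whether each DEFINE_SIMD expansion emits declarations, per-CPU-target definitions, or the public dispatching wrappers. Below is a minimal, self-contained sketch of that multi-pass idea; every name in it (MY_OP_LIST and friends) is hypothetical, and its "dispatch" wrapper simply calls the baseline where the real code routes through CALL_HAL, ARITHM_CALL_IPP and CV_CPU_DISPATCH:

// pattern_sketch.cpp -- illustrative stand-in for the declare/define/dispatch
// passes; not part of the patch, all names are made up
#include <cstdio>

// X-macro list playing the role of the DEFINE_SIMD_* family
#define MY_OP_LIST(EXPAND) \
    EXPAND(add8u)          \
    EXPAND(sub8u)

// pass 1: declarations only (cf. ARITHM_DECLARATIONS_ONLY)
#define MY_DECLARE(name) static void name(int a, int b);
MY_OP_LIST(MY_DECLARE)

// pass 2: definitions; the real header compiles this pass once per
// CPU target (cf. ARITHM_DEFINITIONS_ONLY)
#define MY_DEFINE(name) \
    static void name(int a, int b) { std::printf(#name "(%d, %d)\n", a, b); }
MY_OP_LIST(MY_DEFINE)

// pass 3: dispatching wrappers (cf. ARITHM_DISPATCHING_ONLY)
#define MY_DISPATCH(name) \
    static void name##_dispatch(int a, int b) { name(a, b); }
MY_OP_LIST(MY_DISPATCH)

int main()
{
    add8u_dispatch(1, 2); // prints "add8u(1, 2)"
    sub8u_dispatch(3, 4); // prints "sub8u(3, 4)"
    return 0;
}

The payoff is that one DEFINE_SIMD_ALL(op, ...) line per operation yields the declaration, every per-ISA definition, and the dispatcher without restating any signatures.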
+//=========================================================================
+// Multiply
+//=========================================================================
+
+#ifdef ARITHM_DEFINITIONS_ONLY
+
+///////////////////////////// Operations //////////////////////////////////
+
+template<typename T1, typename Tvec>
+struct op_mul
+{
+    static inline Tvec r(const Tvec& a, const Tvec& b)
+    { return a * b; }
+    static inline T1 r(T1 a, T1 b)
+    { return saturate_cast<T1>(a * b); }
+};
+
+template<typename T1, typename T2, typename Tvec>
+struct op_mul_scale
+{
+    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
+    {
+        const v_float32 v_scalar = vx_setall_f32(*scalar);
+        return v_scalar * a * b;
+    }
+    static inline T1 r(T1 a, T1 b, const T2* scalar)
+    { return c_mul(a, b, *scalar); }
+    static inline Tvec pre(const Tvec&, const Tvec& res)
+    { return res; }
+};
+
+template<>
+struct op_mul_scale<double, double, v_float64>
+{
+#if CV_SIMD_64F
+    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
+    {
+        const v_float64 v_scalar = vx_setall_f64(*scalar);
+        return v_scalar * a * b;
+    }
+#endif
+    static inline double r(double a, double b, const double* scalar)
+    { return c_mul(a, b, *scalar); }
+    static inline v_float64 pre(const v_float64&, const v_float64& res)
+    { return res; }
+};
+
+//////////////////////////// Loops /////////////////////////////////
+
+template<typename T1, typename Tvec>
+static void mul_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+                     T1* dst, size_t step, int width, int height, const double* scalar)
+{
+    float fscalar = (float)*scalar;
+    if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON)
+    {
+        bin_loop<op_mul, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+    }
+    else
+    {
+        scalar_loop<op_mul_scale, T1, float, Tvec>(src1, step1, src2, step2,
+            dst, step, width, height, &fscalar);
+    }
+}
+
+template<typename T1, typename Tvec>
+static void mul_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2,
+                       T1* dst, size_t step, int width, int height, const double* scalar)
+{
+    if (std::fabs(*scalar - 1.0) <= FLT_EPSILON)
+    {
+        bin_loop<op_mul, T1, Tvec>(src1, step1, src2, step2, dst, step, width, height);
+    }
+    else
+    {
+        SCALAR_LOOP64F<op_mul_scale, T1, double, Tvec>(src1, step1, src2, step2,
+            dst, step, width, height, scalar);
+    }
+}
+
+template<>
+void mul_loop_d<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
+                                   double* dst, size_t step, int width, int height, const double* scalar)
+{
+    if (*scalar == 1.0)
+    {
+        BIN_LOOP64F<op_mul, double, v_float64>(src1, step1, src2, step2, dst, step, width, height);
+    }
+    else
+    {
+        SCALAR_LOOP64F<op_mul_scale, double, double, v_float64>(src1, step1, src2, step2,
+            dst, step, width, height, scalar);
+    }
+}
+
+#endif // ARITHM_DEFINITIONS_ONLY
+
+//////////////////////////////////////////////////////////////////////////
+
+#undef SCALAR_ARGS
+#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, const _T1* src2, size_t step2, \
+                         _T1* dst, size_t step, int width, int height
+
+#undef SCALAR_ARGS_PASS
+#define SCALAR_ARGS_PASS src1, step1, src2, step2, dst, step, width, height
+
+#undef DECLARE_SIMD_FUN
+#define DECLARE_SIMD_FUN(fun, _T1) void fun(SCALAR_ARGS(_T1), const double* scalar);
+
+#undef DISPATCH_SIMD_FUN
+#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...)                             \
+    void fun(SCALAR_ARGS(_T1), void* scalar)                                \
+    {                                                                       \
+        CV_INSTRUMENT_REGION();                                             \
+        CALL_HAL(fun, __CV_CAT(cv_hal_, fun),                               \
+                 SCALAR_ARGS_PASS, *(const double*)scalar)                  \
+        ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun),                         \
+                        SCALAR_ARGS_PASS, *(const double*)scalar)           \
+        CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar),     \
+                        CV_CPU_DISPATCH_MODES_ALL);                         \
+    }
+
+#undef DEFINE_SIMD_FUN
+#define DEFINE_SIMD_FUN(fun, _T1, _Tvec, op)                                \
+    void fun(SCALAR_ARGS(_T1), const double* scalar)                        \
+    {                                                                       \
+        CV_INSTRUMENT_REGION();                                             \
+        op<_T1, _Tvec>(SCALAR_ARGS_PASS, scalar);                           \
+    }
+
+#undef DEFINE_NOSIMD_FUN
+#define DEFINE_NOSIMD_FUN(fun, _T1, _OP)                                    \
+    DEFINE_SIMD_FUN(fun, _T1, v_float64, _OP)
+
+DEFINE_SIMD_SAT(mul, mul_loop)
+DEFINE_SIMD_F32(mul, mul_loop_d)
+DEFINE_SIMD_S32(mul, mul_loop_d)
+DEFINE_SIMD_F64(mul, mul_loop_d)
+
+//=========================================================================
+// Div
+//=========================================================================
+
+#ifdef ARITHM_DEFINITIONS_ONLY
+
+///////////////////////////// Operations //////////////////////////////////
+
+template<typename T1, typename Tvec>
+struct op_div_f
+{
+    static inline Tvec r(const Tvec& a, const Tvec& b)
+    { return a / b; }
+    static inline T1 r(T1 a, T1 b)
+    { return a / b; }
+};
+
+template<typename T1, typename T2, typename Tvec>
+struct op_div_scale
+{
+    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
+    {
+        const v_float32 v_scalar = vx_setall_f32(*scalar);
+        return a * v_scalar / b;
+    }
+    static inline Tvec pre(const Tvec& denom, const Tvec& res)
+    {
+        const Tvec v_zero = Tvec();
+        return v_select(denom == v_zero, v_zero, res);
+    }
+    static inline T1 r(T1 a, T1 denom, const T2* scalar)
+    {
+        CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
+        return denom != (T1)0 ? c_div(a, denom, *scalar) : (T1)0;
+    }
+};
+
+template<>
+struct op_div_scale<float, float, v_float32>
+{
+    static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
+    {
+        const v_float32 v_scalar = vx_setall_f32(*scalar);
+        return a * v_scalar / b;
+    }
+    static inline float r(float a, float denom, const float* scalar)
+    { return c_div(a, denom, *scalar); }
+};
+
+template<>
+struct op_div_scale<double, double, v_float64>
+{
+#if CV_SIMD_64F
+    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
+    {
+        const v_float64 v_scalar = vx_setall_f64(*scalar);
+        return a * v_scalar / b;
+    }
+#endif
+    static inline double r(double a, double denom, const double* scalar)
+    { return c_div(a, denom, *scalar); }
+};
+
+//////////////////////////// Loops /////////////////////////////////
+
+template<typename T1, typename Tvec>
+static void div_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+                     T1* dst, size_t step, int width, int height, const double* scalar)
+{
+    float fscalar = (float)*scalar;
+    // todo: add new intrinsics for integer divide
+    scalar_loop<op_div_scale, T1, float, Tvec>(src1, step1, src2, step2,
+        dst, step, width, height, &fscalar);
+}
+
+template<>
+void div_loop<float, v_float32>(const float* src1, size_t step1, const float* src2, size_t step2,
+                                float* dst, size_t step, int width, int height, const double* scalar)
+{
+    float fscalar = (float)*scalar;
+    if (std::fabs(fscalar - 1.0f) <= FLT_EPSILON)
+    {
+        bin_loop<op_div_f, float, v_float32>(src1, step1, src2, step2, dst, step, width, height);
+    }
+    else
+    {
+        SCALAR_LOOP64F<op_div_scale, float, float, v_float32>(src1, step1, src2, step2,
+            dst, step, width, height, &fscalar);
+    }
+}
+
+template<>
+void div_loop<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
+                                 double* dst, size_t step, int width, int height, const double* scalar)
+{
+    if (*scalar == 1.0)
+    {
+        BIN_LOOP64F<op_div_f, double, v_float64>(src1, step1, src2, step2, dst, step, width, height);
+    }
+    else
+    {
+        SCALAR_LOOP64F<op_div_scale, double, double, v_float64>(src1, step1, src2, step2,
+            dst, step, width, height, scalar);
+    }
+}
+
+#endif // ARITHM_DEFINITIONS_ONLY
+
+//////////////////////////////////////////////////////////////////////////
+
+DEFINE_SIMD_ALL(div, div_loop)
+
+//=========================================================================
+// AddWeighted
+//=========================================================================
+
+#ifdef ARITHM_DEFINITIONS_ONLY
+
+///////////////////////////// Operations //////////////////////////////////
+
+///// Add scale
+template<typename T1, typename T2, typename Tvec>
+struct op_add_scale
+{
+    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
+    {
+        const v_float32 v_alpha = vx_setall_f32(*scalar);
+        return v_fma(a, v_alpha, b);
+    }
+    static inline T1 r(T1 a, T1 b, const T2* scalar)
+    { return c_add(a, b, *scalar); }
+    static inline Tvec pre(const Tvec&, const Tvec& res)
+    { return res; }
+};
+
+template<>
+struct op_add_scale<double, double, v_float64>
+{
+#if CV_SIMD_64F
+    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
+    {
+        const v_float64 v_alpha = vx_setall_f64(*scalar);
+        return v_fma(a, v_alpha, b);
+    }
+#endif
+    static inline double r(double a, double b, const double* scalar)
+    { return c_add(a, b, *scalar); }
+    static inline v_float64 pre(const v_float64&, const v_float64& res)
+    { return res; }
+};
+
+///// Weighted sum
+template<typename T1, typename T2, typename Tvec>
+struct op_add_weighted
+{
+    static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
+    {
+        const v_float32 v_alpha = vx_setall_f32(scalars[0]);
+        const v_float32 v_beta  = vx_setall_f32(scalars[1]);
+        const v_float32 v_gamma = vx_setall_f32(scalars[2]);
+        return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
+    }
+    static inline T1 r(T1 a, T1 b, const T2* scalars)
+    { return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
+    static inline Tvec pre(const Tvec&, const Tvec& res)
+    { return res; }
+};
+
+template<>
+struct op_add_weighted<double, double, v_float64>
+{
+#if CV_SIMD_64F
+    static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalars)
+    {
+        const v_float64 v_alpha = vx_setall_f64(scalars[0]);
+        const v_float64 v_beta  = vx_setall_f64(scalars[1]);
+        const v_float64 v_gamma = vx_setall_f64(scalars[2]);
+        return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma));
+    }
+#endif
+    static inline double r(double a, double b, const double* scalars)
+    { return c_add(a, b, scalars[0], scalars[1], scalars[2]); }
+    static inline v_float64 pre(const v_float64&, const v_float64& res)
+    { return res; }
+};
+
+//////////////////////////// Loops /////////////////////////////////
+
+template<typename T1, typename Tvec>
+static void add_weighted_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
+                              T1* dst, size_t step, int width, int height, const double* scalars)
+{
+    float fscalars[] = {(float)scalars[0], (float)scalars[1], (float)scalars[2]};
+    if (fscalars[1] == 1.0f && fscalars[2] == 0.0f)
+    {
+        scalar_loop<op_add_scale, T1, float, Tvec>(src1, step1, src2, step2,
+            dst, step, width, height, fscalars);
+    }
+    else
+    {
+        scalar_loop<op_add_weighted, T1, float, Tvec>(src1, step1, src2, step2,
+            dst, step, width, height, fscalars);
+    }
+}
+
+template<typename T1, typename Tvec>
+static void add_weighted_loop_d(const T1* src1, size_t step1, const T1* src2, size_t step2,
+                                T1* dst, size_t step, int width, int height, const double* scalars)
+{
+    if (scalars[1] == 1.0 && scalars[2] == 0.0)
+    {
+        SCALAR_LOOP64F<op_add_scale, T1, double, Tvec>(src1, step1, src2, step2,
+            dst, step, width, height, scalars);
+    }
+    else
+    {
+        SCALAR_LOOP64F<op_add_weighted, T1, double, Tvec>(src1, step1, src2, step2,
+            dst, step, width, height, scalars);
+    }
+}
+
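// ------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the original patch]
// The apparently identical branches in the two loops above differ only in
// the operation tag carried by the template arguments: the fast path
// (beta == 1, gamma == 0) lowers addWeighted to op_add_scale, a single
// fused multiply-add, while the general path uses op_add_weighted. The
// exported entry points are stamped out by the DEFINE_SIMD_* helpers
// declared earlier in this file; assuming they concatenate the operation
// name with the element-type suffix, DEFINE_SIMD_F32(addWeighted, ...)
// combined with the DISPATCH_SIMD_FUN redefined just below should expand
// to roughly:
//
//     void addWeighted32f(const float* src1, size_t step1,
//                         const float* src2, size_t step2,
//                         float* dst, size_t step, int width, int height,
//                         void* scalar)
//     {
//         CV_INSTRUMENT_REGION();
//         // 1) external HAL hook, 2) IPP (see arithm_ipp.hpp), 3) the
//         // CPU-dispatched SIMD loop generated from add_weighted_loop.
//         CALL_HAL(addWeighted32f, cv_hal_addWeighted32f, src1, step1, src2, step2,
//                  dst, step, width, height, (const double*)scalar)
//         ARITHM_CALL_IPP(arithm_ipp_addWeighted32f, src1, step1, src2, step2,
//                         dst, step, width, height, (const double*)scalar)
//         CV_CPU_DISPATCH(addWeighted32f, (src1, step1, src2, step2, dst, step,
//                                          width, height, (const double*)scalar),
//                         CV_CPU_DISPATCH_MODES_ALL);
//     }
// ------------------------------------------------------------------------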
+template<>
+void add_weighted_loop_d<double, v_float64>(const double* src1, size_t step1, const double* src2, size_t step2,
+                                            double* dst, size_t step, int width, int height, const double* scalars)
+{
+    if (scalars[1] == 1.0 && scalars[2] == 0.0)
+    {
+        SCALAR_LOOP64F<op_add_scale, double, double, v_float64>(src1, step1, src2, step2,
+            dst, step, width, height, scalars);
+    }
+    else
+    {
+        SCALAR_LOOP64F<op_add_weighted, double, double, v_float64>(src1, step1, src2, step2,
+            dst, step, width, height, scalars);
+    }
+}
+
+#endif // ARITHM_DEFINITIONS_ONLY
+
+//////////////////////////////////////////////////////////////////////////
+
+#undef DISPATCH_SIMD_FUN
+#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...)                             \
+    void fun(SCALAR_ARGS(_T1), void* scalar)                                \
+    {                                                                       \
+        CV_INSTRUMENT_REGION();                                             \
+        CALL_HAL(fun, __CV_CAT(cv_hal_, fun),                               \
+                 SCALAR_ARGS_PASS, (const double*)scalar)                   \
+        ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun),                         \
+                        SCALAR_ARGS_PASS, (const double*)scalar)            \
+        CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar),     \
+                        CV_CPU_DISPATCH_MODES_ALL);                         \
+    }
+
+DEFINE_SIMD_SAT(addWeighted, add_weighted_loop)
+DEFINE_SIMD_S32(addWeighted, add_weighted_loop_d)
+DEFINE_SIMD_F32(addWeighted, add_weighted_loop_d)
+DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
+
+//=======================================
+// Reciprocal
+//=======================================
+
+#ifdef ARITHM_DEFINITIONS_ONLY
+
+///////////////////////////// Operations //////////////////////////////////
+
+template<typename T1, typename T2, typename Tvec>
+struct op_recip
+{
+    static inline v_float32 r(const v_float32& a, const T2* scalar)
+    {
+        const v_float32 v_scalar = vx_setall_f32(*scalar);
+        return v_scalar / a;
+    }
+    static inline Tvec pre(const Tvec& denom, const Tvec& res)
+    {
+        const Tvec v_zero = Tvec();
+        return v_select(denom == v_zero, v_zero, res);
+    }
+    static inline T1 r(T1 denom, const T2* scalar)
+    {
+        CV_StaticAssert(std::numeric_limits<T1>::is_integer, "");
+        return denom != (T1)0 ? c_div(*scalar, denom) : (T1)0;
+    }
+};
+
+template<>
+struct op_recip<float, float, v_float32>
+{
+    static inline v_float32 r(const v_float32& a, const float* scalar)
+    {
+        const v_float32 v_scalar = vx_setall_f32(*scalar);
+        return v_scalar / a;
+    }
+    static inline float r(float denom, const float* scalar)
+    { return c_div(*scalar, denom); }
+};
+
+template<>
+struct op_recip<double, double, v_float64>
+{
+#if CV_SIMD_64F
+    static inline v_float64 r(const v_float64& a, const double* scalar)
+    {
+        const v_float64 v_scalar = vx_setall_f64(*scalar);
+        return v_scalar / a;
+    }
+#endif
+    static inline double r(double denom, const double* scalar)
+    { return c_div(*scalar, denom); }
+};
+
+//////////////////////////// Loops /////////////////////////////////
+
+template<typename T1, typename Tvec>
+static void recip_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const double* scalar)
+{
+    float fscalar = (float)*scalar;
+    scalar_loop<op_recip, T1, float, Tvec>(src1, step1, dst, step, width, height, &fscalar);
+}
+
+template<>
+void recip_loop<double, v_float64>(const double* src1, size_t step1, double* dst, size_t step, int width, int height, const double* scalar)
+{
+    SCALAR_LOOP64F<op_recip, double, double, v_float64>(src1, step1, dst, step, width, height, scalar);
+}
+
+#endif // ARITHM_DEFINITIONS_ONLY
+
+//////////////////////////////////////////////////////////////////////////
+
+#undef SCALAR_ARGS
+#define SCALAR_ARGS(_T1) const _T1* src1, size_t step1, _T1* dst, size_t step, int width, int height
+
+#undef SCALAR_ARGS_PASS
+#define SCALAR_ARGS_PASS src1, step1, dst, step, width, height
+
+#undef DISPATCH_SIMD_FUN
+#define DISPATCH_SIMD_FUN(fun, _T1, _Tvec, ...)                             \
+    void fun(const _T1*, size_t, SCALAR_ARGS(_T1), void* scalar)            \
+    {                                                                       \
+        CV_INSTRUMENT_REGION();                                             \
+        CALL_HAL(fun, __CV_CAT(cv_hal_, fun),                               \
+                 SCALAR_ARGS_PASS, *(const double*)scalar)                  \
+        ARITHM_CALL_IPP(__CV_CAT(arithm_ipp_, fun),                         \
+                        SCALAR_ARGS_PASS, *(const double*)scalar)           \
+        CV_CPU_DISPATCH(fun, (SCALAR_ARGS_PASS, (const double*)scalar),     \
+                        CV_CPU_DISPATCH_MODES_ALL);                         \
+    }
+
+DEFINE_SIMD_ALL(recip, recip_loop)
+
+#ifndef ARITHM_DISPATCHING_ONLY
+    CV_CPU_OPTIMIZATION_NAMESPACE_END
+#endif
+
+#ifndef SIMD_GUARD
+    #define SIMD_GUARD
+#endif
+
+}} // cv::hal::
\ No newline at end of file
diff --git a/modules/core/src/arithm_core.hpp b/modules/core/src/arithm_core.hpp
deleted file mode 100644
index 7b7d6f7d85..0000000000
--- a/modules/core/src/arithm_core.hpp
+++ /dev/null
@@ -1,629 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2015, Itseez Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-// -//M*/ - -#ifndef __OPENCV_ARITHM_CORE_HPP__ -#define __OPENCV_ARITHM_CORE_HPP__ - -#include "arithm_simd.hpp" - -namespace cv { - -template struct OpAdd -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } -}; - -template struct OpSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } -}; - -template struct OpRSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } -}; - -template struct OpMin -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct OpMax -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - -template struct OpAbsDiff -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()(T a, T b) const { return a > b ? a - b : b - a; } -}; - -// specializations to prevent "-0" results -template<> struct OpAbsDiff -{ - typedef float type1; - typedef float type2; - typedef float rtype; - float operator()(float a, float b) const { return std::abs(a - b); } -}; -template<> struct OpAbsDiff -{ - typedef double type1; - typedef double type2; - typedef double rtype; - double operator()(double a, double b) const { return std::abs(a - b); } -}; - -template struct OpAnd -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a & b; } -}; - -template struct OpOr -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a | b; } -}; - -template struct OpXor -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a ^ b; } -}; - -template struct OpNot -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T ) const { return ~a; } -}; - -//============================================================================= - -template -void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 || CV_NEON - VOp vop; -#endif - Op op; - - for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = vop(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); - r0 = vop(r0, VLoadStore128::load(src2 + x )); - r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 16/sizeof(T), r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_AVX2 - // nothing -#elif CV_SSE2 - if( USE_SSE2 ) - { - for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) ) - { - typename VLoadStore64::reg_type r = 
VLoadStore64::load(src1 + x); - r = vop(r, VLoadStore64::load(src2 + x)); - VLoadStore64::store(dst + x, r); - } - } -#endif - -#if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -template -void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 || CV_NEON - Op32 op32; -#endif - Op op; - - for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 4, r1); - } - } - } -#endif // CV_AVX2 - -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = op32(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= width - 8; x += 8 ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128::load(src2 + x )); - r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 4, r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - - -template -void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height) -{ -#if CV_SSE2 - Op64 op64; -#endif - Op op; - - for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; - -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= width - 4; x += 4 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) - { - if( 
(((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= width - 4; x += 4 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); - r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 2, r1); - } - } - } -#endif - - for( ; x <= width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } - - for( ; x < width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -template static void -cmp_(const T* src1, size_t step1, const T* src2, size_t step2, - uchar* dst, size_t step, int width, int height, int code) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } - - Cmp_SIMD vop(code); - - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, width); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] > src2[x]) ^ m; - t1 = -(src1[x+1] > src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] > src2[x+2]) ^ m; - t1 = -(src1[x+3] > src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] == src2[x]) ^ m; - t1 = -(src1[x+1] == src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] == src2[x+2]) ^ m; - t1 = -(src1[x+3] == src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } -} - -template static void -mul_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, WT scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Mul_SIMD vop; - - if( scale == (WT)1. 
) - { - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= width - 4; i += 4 ) - { - T t0; - T t1; - t0 = saturate_cast(src1[i ] * src2[i ]); - t1 = saturate_cast(src1[i+1] * src2[i+1]); - dst[i ] = t0; - dst[i+1] = t1; - - t0 = saturate_cast(src1[i+2] * src2[i+2]); - t1 = saturate_cast(src1[i+3] * src2[i+3]); - dst[i+2] = t0; - dst[i+3] = t1; - } - #endif - for( ; i < width; i++ ) - dst[i] = saturate_cast(src1[i] * src2[i]); - } - } - else - { - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= width - 4; i += 4 ) - { - T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); - T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); - dst[i] = t0; dst[i+1] = t1; - - t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); - t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < width; i++ ) - dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); - } - } -} - - -template static void -div_i( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - float scale_f = (float)scale; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - for( ; i < width; i++ ) - { - T num = src1[i], denom = src2[i]; - T v = 0; - if (denom != 0) - v = saturate_cast(num*scale_f/denom); - dst[i] = v; - } - } -} - -template static void -div_f( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - T scale_f = (T)scale; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, width, scale); - for( ; i < width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = saturate_cast(num*scale_f/denom); - } - } -} - -template static void -recip_i( const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - float scale_f = (float)scale; - - for( ; height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, width, scale); - for( ; i < width; i++ ) - { - T denom = src2[i]; - T v = 0; - if (denom != 0) - v = saturate_cast(scale_f/denom); - dst[i] = v; - } - } -} - -template static void -recip_f( const T* src2, size_t step2, - T* dst, size_t step, int width, int height, double scale ) -{ - T scale_f = (T)scale; - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - - for( ; height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, width, scale); - for( ; i < width; i++ ) - { - T denom = src2[i]; - dst[i] = saturate_cast(scale_f/denom); - } - } -} - -template static void -addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, int width, int height, void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - AddWeighted_SIMD vop; - - for( ; height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 
vop(src1, src2, dst, width, alpha, beta, gamma); - #if CV_ENABLE_UNROLLED - for( ; x <= width - 4; x += 4 ) - { - T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); - dst[x] = t0; dst[x+1] = t1; - - t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); - t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < width; x++ ) - dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - } -} - -} // cv:: - - -#endif // __OPENCV_ARITHM_CORE_HPP__ diff --git a/modules/core/src/arithm_ipp.hpp b/modules/core/src/arithm_ipp.hpp new file mode 100644 index 0000000000..4aa7d006e4 --- /dev/null +++ b/modules/core/src/arithm_ipp.hpp @@ -0,0 +1,417 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#if ARITHM_USE_IPP + +namespace cv { namespace hal { + +//======================================= +// Arithmetic and logical operations +// +, -, *, /, &, |, ^, ~, abs ... +//======================================= + +#define ARITHM_IPP_BIN(fun, ...) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +//======================================= +// Addition +//======================================= + +inline int arithm_ipp_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add16s(const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_add32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAdd_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_add8s(...) 0 +#define arithm_ipp_add32s(...) 0 +#define arithm_ipp_add64f(...) 
0 + +//======================================= +// Subtract +//======================================= + +inline int arithm_ipp_sub8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_8u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_16u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub16s(const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_16s_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_sub32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiSub_32f_C1R, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_sub8s(...) 0 +#define arithm_ipp_sub32s(...) 0 +#define arithm_ipp_sub64f(...) 0 + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#define ARITHM_IPP_MIN_MAX(fun, type) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + type* s1 = (type*)src1; \ + type* s2 = (type*)src2; \ + type* d = dst; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + int i = 0; \ + for(; i < height; i++) \ + { \ + if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \ + break; \ + s1 = (type*)((uchar*)s1 + step1); \ + s2 = (type*)((uchar*)s2 + step2); \ + d = (type*)((uchar*)d + step); \ + } \ + if (i == height) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +//======================================= +// Max +//======================================= + +inline int arithm_ipp_max8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_8u, uchar); +} + +inline int arithm_ipp_max16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_16u, ushort); +} + +inline int arithm_ipp_max32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_32f, float); +} + +inline int arithm_ipp_max64f(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMaxEvery_64f, double); +} + +#define arithm_ipp_max8s(...) 0 +#define arithm_ipp_max16s(...) 0 +#define arithm_ipp_max32s(...) 
0 + +//======================================= +// Min +//======================================= + +inline int arithm_ipp_min8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_8u, uchar); +} + +inline int arithm_ipp_min16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_16u, ushort); +} + +inline int arithm_ipp_min32f(const float* src1, size_t step1, const float* src2,size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_32f, float); +} + +inline int arithm_ipp_min64f(const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height) +{ + ARITHM_IPP_MIN_MAX(ippsMinEvery_64f, double); +} + +#define arithm_ipp_min8s(...) 0 +#define arithm_ipp_min16s(...) 0 +#define arithm_ipp_min32s(...) 0 + +//======================================= +// AbsDiff +//======================================= + +inline int arithm_ipp_absdiff8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_absdiff16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_absdiff32f(const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAbsDiff_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} +#define arithm_ipp_absdiff8s(...) 0 +#define arithm_ipp_absdiff16s(...) 0 +#define arithm_ipp_absdiff32s(...) 0 +#define arithm_ipp_absdiff64f(...) 0 + +//======================================= +// Logical +//======================================= + +inline int arithm_ipp_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiAnd_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiOr_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height) +{ + ARITHM_IPP_BIN(ippiXor_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height) +{ + if (!CV_IPP_CHECK_COND) + return 0; + if (height == 1) + step1 = step = width * sizeof(dst[0]); + if (0 <= CV_INSTRUMENT_FUN_IPP(ippiNot_8u_C1R, src1, (int)step1, dst, (int)step, ippiSize(width, height))) + { + CV_IMPL_ADD(CV_IMPL_IPP); + return 1; + } + setIppErrorStatus(); + return 0; +} + +//======================================= +// Compare +//======================================= + +#define ARITHM_IPP_CMP(fun, ...) 
\ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + IppCmpOp op = arithm_ipp_convert_cmp(cmpop); \ + if (op < 0) \ + return 0; \ + if (height == 1) \ + step1 = step2 = step = width * sizeof(dst[0]); \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__, op)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +inline IppCmpOp arithm_ipp_convert_cmp(int cmpop) +{ + switch(cmpop) + { + case CMP_EQ: return ippCmpEq; + case CMP_GT: return ippCmpGreater; + case CMP_GE: return ippCmpGreaterEq; + case CMP_LT: return ippCmpLess; + case CMP_LE: return ippCmpLessEq; + default: return (IppCmpOp)-1; + } +} + +inline int arithm_ipp_cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_16s_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +inline int arithm_ipp_cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int cmpop) +{ + ARITHM_IPP_CMP(ippiCompare_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_cmp8s(...) 0 +#define arithm_ipp_cmp32s(...) 0 +#define arithm_ipp_cmp64f(...) 0 + +//======================================= +// Multiply +//======================================= + +#define ARITHM_IPP_MUL(fun, ...) \ +do { \ + if (!CV_IPP_CHECK_COND) \ + return 0; \ + float fscale = (float)scale; \ + if (std::fabs(fscale - 1) > FLT_EPSILON) \ + return 0; \ + if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return 1; \ + } \ + setIppErrorStatus(); \ + return 0; \ +} while(0) + +inline int arithm_ipp_mul8u(const uchar *src1, size_t step1, const uchar *src2, size_t step2, + uchar *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_8u_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} +inline int arithm_ipp_mul16u(const ushort *src1, size_t step1, const ushort *src2, size_t step2, + ushort *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_16u_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_mul16s(const short *src1, size_t step1, const short *src2, size_t step2, + short *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_16s_C1RSfs, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height), 0); +} + +inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2, size_t step2, + float *dst, size_t step, int width, int height, double scale) +{ + ARITHM_IPP_MUL(ippiMul_32f_C1R, src1, (int)step1, src2, (int)step2,dst, (int)step, ippiSize(width, height)); +} + +#define arithm_ipp_mul8s(...) 0 +#define arithm_ipp_mul32s(...) 
0 +#define arithm_ipp_mul64f(...) 0 + +//======================================= +// Div +//======================================= + +#define arithm_ipp_div8u(...) 0 +#define arithm_ipp_div8s(...) 0 +#define arithm_ipp_div16u(...) 0 +#define arithm_ipp_div16s(...) 0 +#define arithm_ipp_div32s(...) 0 +#define arithm_ipp_div32f(...) 0 +#define arithm_ipp_div64f(...) 0 + +//======================================= +// AddWeighted +//======================================= + +#define arithm_ipp_addWeighted8u(...) 0 +#define arithm_ipp_addWeighted8s(...) 0 +#define arithm_ipp_addWeighted16u(...) 0 +#define arithm_ipp_addWeighted16s(...) 0 +#define arithm_ipp_addWeighted32s(...) 0 +#define arithm_ipp_addWeighted32f(...) 0 +#define arithm_ipp_addWeighted64f(...) 0 + +//======================================= +// Reciprocial +//======================================= + +#define arithm_ipp_recip8u(...) 0 +#define arithm_ipp_recip8s(...) 0 +#define arithm_ipp_recip16u(...) 0 +#define arithm_ipp_recip16s(...) 0 +#define arithm_ipp_recip32s(...) 0 +#define arithm_ipp_recip32f(...) 0 +#define arithm_ipp_recip64f(...) 0 + +/** empty block in case if you have "fun" +#define arithm_ipp_8u(...) 0 +#define arithm_ipp_8s(...) 0 +#define arithm_ipp_16u(...) 0 +#define arithm_ipp_16s(...) 0 +#define arithm_ipp_32s(...) 0 +#define arithm_ipp_32f(...) 0 +#define arithm_ipp_64f(...) 0 +**/ + +}} // cv::hal:: + +#define ARITHM_CALL_IPP(fun, ...) \ +{ \ + if (__CV_EXPAND(fun(__VA_ARGS__))) \ + return; \ +} + +#endif // ARITHM_USE_IPP + + +#if !ARITHM_USE_IPP +#define ARITHM_CALL_IPP(...) +#endif \ No newline at end of file diff --git a/modules/core/src/arithm_simd.hpp b/modules/core/src/arithm_simd.hpp deleted file mode 100644 index 98a0126d20..0000000000 --- a/modules/core/src/arithm_simd.hpp +++ /dev/null @@ -1,2009 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2015, Itseez Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_ARITHM_SIMD_HPP__ -#define __OPENCV_ARITHM_SIMD_HPP__ - -namespace cv { - -struct NOP {}; - -#if CV_SSE2 || CV_NEON -#define IF_SIMD(op) op -#else -#define IF_SIMD(op) NOP -#endif - - -#if CV_SSE2 || CV_NEON - -#define FUNCTOR_TEMPLATE(name) \ - template struct name {} - -FUNCTOR_TEMPLATE(VLoadStore128); -#if CV_SSE2 -FUNCTOR_TEMPLATE(VLoadStore64); -FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX2 -FUNCTOR_TEMPLATE(VLoadStore256); -FUNCTOR_TEMPLATE(VLoadStore256Aligned); -#endif -#endif - -#endif - -#if CV_AVX2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return 
_mm256_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); - - -static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, - 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, - 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m256i d = _mm256_subs_epi8(a, b); - __m256i m = _mm256_cmpgt_epi8(b, a); - return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m256i M = _mm256_max_epi16(a, b); - __m256i m = _mm256_min_epi16(a, b); - return _mm256_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m256i d = _mm256_sub_epi32(a, b); - __m256i m = _mm256_cmpgt_epi32(b, a); - return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); - -#elif CV_SSE2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static 
reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & b) const \ - { \ - body; \ - } \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & ) const \ - { \ - body; \ - } \ - } - -FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); - -FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); -FUNCTOR_CLOSURE_2arg(VMin, short, return 
_mm_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); - - -static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m128i d = _mm_subs_epi8(a, b); - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_subs_epi8(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m128i M = _mm_max_epi16(a, b); - __m128i m = _mm_min_epi16(a, b); - return _mm_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m128i d = _mm_sub_epi32(a, b); - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_sub_epi32(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); -#endif - -#if CV_NEON - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p);}; \ - static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type b) const \ - { \ - return body; \ - }; \ - } - -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type ) const \ - { \ - return body; \ - }; \ - } - -FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); -FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); -FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); -FUNCTOR_LOADSTORE(VLoadStore128, 
short, int16x8_t, vld1q_s16, vst1q_s16);
-FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
-FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
-
-FUNCTOR_TEMPLATE(VAdd);
-FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
-FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
-FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
-FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
-FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
-FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
-
-FUNCTOR_TEMPLATE(VSub);
-FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
-FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
-FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
-FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
-FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
-FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
-
-FUNCTOR_TEMPLATE(VMin);
-FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
-FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
-FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
-FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
-FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
-FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
-
-FUNCTOR_TEMPLATE(VMax);
-FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
-FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
-FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
-FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
-FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
-FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
-
-FUNCTOR_TEMPLATE(VAbsDiff);
-FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
-FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
-FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
-FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
-FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
-FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
-
-FUNCTOR_TEMPLATE(VAnd);
-FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
-FUNCTOR_TEMPLATE(VOr);
-FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
-FUNCTOR_TEMPLATE(VXor);
-FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
-FUNCTOR_TEMPLATE(VNot);
-FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
-#endif
-
-
-template <typename T>
-struct Cmp_SIMD
-{
-    explicit Cmp_SIMD(int)
-    {
-    }
-
-    int operator () (const T *, const T *, uchar *, int) const
-    {
-        return 0;
-    }
-};
-
-#if CV_NEON
-
-template <>
-struct Cmp_SIMD<schar>
-{
-    explicit Cmp_SIMD(int code_) :
-        code(code_)
-    {
-        // CV_Assert(code == CMP_GT || code == CMP_LE ||
-        //           code == CMP_EQ || code == CMP_NE);
-
-        v_mask = vdupq_n_u8(255);
-    }
-
-    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
-    {
-        int x = 0;
-
-        if (code == CMP_GT)
-            for ( ; x <= width - 16; x += 16)
-                vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
-        else if (code == CMP_LE)
-            for ( ; x <= width - 16; x += 16)
-                vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
-        else if (code == CMP_EQ)
-            for ( ; x <= width - 16; x += 16)
-                vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
-        else if (code == CMP_NE)
-            for ( ; x <= width - 16; x += 16)
-                vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
-
-        return x;
-    }
-
-    int code;
-    uint8x16_t v_mask;
-};
-
-template <>
-struct Cmp_SIMD<ushort>
-{
-    explicit Cmp_SIMD(int code_) :
-        code(code_)
-    {
-        // CV_Assert(code == CMP_GT || code == CMP_LE ||
-        //           code == CMP_EQ || code == CMP_NE);
-
-        v_mask = vdup_n_u8(255);
-    }
-
-    int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
-    {
-        int x = 0;
-
-        if (code == CMP_GT)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
-                vst1_u8(dst + x, vmovn_u16(v_dst));
-            }
-        else if (code == CMP_LE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
-                vst1_u8(dst + x, vmovn_u16(v_dst));
-            }
-        else if (code == CMP_EQ)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
-                vst1_u8(dst + x, vmovn_u16(v_dst));
-            }
-        else if (code == CMP_NE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
-                vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
-            }
-
-        return x;
-    }
-
-    int code;
-    uint8x8_t v_mask;
-};
-
-template <>
-struct Cmp_SIMD<int>
-{
-    explicit Cmp_SIMD(int code_) :
-        code(code_)
-    {
-        // CV_Assert(code == CMP_GT || code == CMP_LE ||
-        //           code == CMP_EQ || code == CMP_NE);
-
-        v_mask = vdup_n_u8(255);
-    }
-
-    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
-    {
-        int x = 0;
-
-        if (code == CMP_GT)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
-                uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
-                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
-            }
-        else if (code == CMP_LE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
-                uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
-                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
-            }
-        else if (code == CMP_EQ)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
-                uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
-                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
-            }
-        else if (code == CMP_NE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
-                uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
-                uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
-                vst1_u8(dst + x, veor_u8(v_dst, v_mask));
-            }
-
-        return x;
-    }
-
-    int code;
-    uint8x8_t v_mask;
-};
-
-template <>
-struct Cmp_SIMD<float>
-{
-    explicit Cmp_SIMD(int code_) :
-        code(code_)
-    {
-        // CV_Assert(code == CMP_GT || code == CMP_LE ||
-        //           code == CMP_EQ || code == CMP_NE);
-
-        v_mask = vdup_n_u8(255);
-    }
-
-    int operator () (const float * src1, const float * src2, uchar * dst, int width) const
-    {
-        int x = 0;
-
-        if (code == CMP_GT)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
-                uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
-                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
-            }
-        else if (code == CMP_LE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
-                uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
-                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
-            }
-        else if (code == CMP_EQ)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
-                uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
-                vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
-            }
-        else if (code == CMP_NE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
-                uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
-                uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
-                vst1_u8(dst + x, veor_u8(v_dst, v_mask));
-            }
-
-        return x;
-    }
-
-    int code;
-    uint8x8_t v_mask;
-};
-
-#elif CV_SSE2
-
-template <>
-struct Cmp_SIMD<schar>
-{
-    explicit Cmp_SIMD(int code_) :
-        code(code_)
-    {
-        // CV_Assert(code == CMP_GT || code == CMP_LE ||
-        //           code == CMP_EQ || code == CMP_NE);
-
-        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
-
-        v_mask = _mm_set1_epi8(-1);
-    }
-
-    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
-    {
-        int x = 0;
-
-        if (!haveSSE)
-            return x;
-
-        if (code == CMP_GT)
-            for ( ; x <= width - 16; x += 16)
-                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
-        else if (code == CMP_LE)
-            for ( ; x <= width - 16; x += 16)
-            {
-                __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
-                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
-            }
-        else if (code == CMP_EQ)
-            for ( ; x <= width - 16; x += 16)
-                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
-        else if (code == CMP_NE)
-            for ( ; x <= width - 16; x += 16)
-            {
-                __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
-                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
-            }
-
-        return x;
-    }
-
-    int code;
-    __m128i v_mask;
-    bool haveSSE;
-};
-
-template <>
-struct Cmp_SIMD<int>
-{
-    explicit Cmp_SIMD(int code_) :
-        code(code_)
-    {
-        // CV_Assert(code == CMP_GT || code == CMP_LE ||
-        //           code == CMP_EQ || code == CMP_NE);
-
-        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
-
-        v_mask = _mm_set1_epi32(0xffffffff);
-    }
-
-    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
-    {
-        int x = 0;
-
-        if (!haveSSE)
-            return x;
-
-        if (code == CMP_GT)
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
-                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
-
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
-            }
-        else if (code == CMP_LE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
-                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
-
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
-            }
-        else if (code == CMP_EQ)
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
-                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
-
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
-            }
-        else if (code == CMP_NE)
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
-                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
-                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
-
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
-            }
-
-        return x;
-    }
-
-    int code;
-    __m128i v_mask;
-    bool haveSSE;
-};
-
-#endif
-
-
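[annotation] The per-ISA comparison kernels deleted above are subsumed by OpenCV's universal intrinsics, which emit the equivalent NEON/SSE instructions from a single implementation. A minimal sketch, not part of this patch, assuming the v_* API from opencv2/core/hal/intrin.hpp:

    #include "opencv2/core/hal/intrin.hpp"
    // CMP_GT for schar, written once for all SIMD backends; as in the deleted
    // kernels, the caller finishes the scalar tail from the returned index.
    static int cmpGT8s(const schar* src1, const schar* src2, uchar* dst, int width)
    {
        int x = 0;
    #if CV_SIMD128
        for (; x <= width - v_int8x16::nlanes; x += v_int8x16::nlanes)
            v_store(dst + x, v_reinterpret_as_u8(v_load(src1 + x) > v_load(src2 + x)));
    #endif
        return x;
    }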
-template <typename T, typename WT>
-struct Mul_SIMD
-{
-    int operator() (const T *, const T *, T *, int, WT) const
-    {
-        return 0;
-    }
-};
-
-#if CV_NEON
-
-template <>
-struct Mul_SIMD<uchar, float>
-{
-    int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if( scale == 1.0f )
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
-                uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
-
-                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
-                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
-                vst1_u8(dst + x, vqmovn_u16(v_dst));
-            }
-        else
-        {
-            float32x4_t v_scale = vdupq_n_f32(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
-                uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
-                v_dst1 = vmulq_f32(v_dst1, v_scale);
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
-                v_dst2 = vmulq_f32(v_dst2, v_scale);
-
-                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
-                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
-                vst1_u8(dst + x, vqmovn_u16(v_dst));
-            }
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Mul_SIMD<schar, float>
-{
-    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if( scale == 1.0f )
-            for ( ; x <= width - 8; x += 8)
-            {
-                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
-                int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
-
-                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
-                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
-                vst1_s8(dst + x, vqmovn_s16(v_dst));
-            }
-        else
-        {
-            float32x4_t v_scale = vdupq_n_f32(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
-                int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
-                v_dst1 = vmulq_f32(v_dst1, v_scale);
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
-                v_dst2 = vmulq_f32(v_dst2, v_scale);
-
-                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
-                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
-                vst1_s8(dst + x, vqmovn_s16(v_dst));
-            }
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Mul_SIMD<ushort, float>
-{
-    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if( scale == 1.0f )
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
-
-                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
-                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
-                vst1q_u16(dst + x, v_dst);
-            }
-        else
-        {
-            float32x4_t v_scale = vdupq_n_f32(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
-                v_dst1 = vmulq_f32(v_dst1, v_scale);
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
-                                               vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
-                v_dst2 = vmulq_f32(v_dst2, v_scale);
-
-                uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
-                                                vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
-                vst1q_u16(dst + x, v_dst);
-            }
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Mul_SIMD<short, float>
-{
-    int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if( scale == 1.0f )
-            for ( ; x <= width - 8; x += 8)
-            {
-                int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
-
-                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
-                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
-                vst1q_s16(dst + x, v_dst);
-            }
-        else
-        {
-            float32x4_t v_scale = vdupq_n_f32(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
-
-                float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
-                v_dst1 = vmulq_f32(v_dst1, v_scale);
-                float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
-                                               vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
-                v_dst2 = vmulq_f32(v_dst2, v_scale);
-
-                int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
-                                               vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
-                vst1q_s16(dst + x, v_dst);
-            }
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Mul_SIMD<float, float>
-{
-    int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if( scale == 1.0f )
-            for ( ; x <= width - 8; x += 8)
-            {
-                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
-                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
-                vst1q_f32(dst + x, v_dst1);
-                vst1q_f32(dst + x + 4, v_dst2);
-            }
-        else
-        {
-            float32x4_t v_scale = vdupq_n_f32(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
-                v_dst1 = vmulq_f32(v_dst1, v_scale);
-
-                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
-                v_dst2 = vmulq_f32(v_dst2, v_scale);
-
-                vst1q_f32(dst + x, v_dst1);
-                vst1q_f32(dst + x + 4, v_dst2);
-            }
-        }
-
-        return x;
-    }
-};
-
-#elif CV_SSE2
-
-#if CV_SSE4_1
-
-template <>
-struct Mul_SIMD<ushort, float>
-{
-    Mul_SIMD()
-    {
-        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
-    }
-
-    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if (!haveSSE)
-            return x;
-
-        __m128i v_zero = _mm_setzero_si128();
-
-        if( scale != 1.0f )
-        {
-            __m128 v_scale = _mm_set1_ps(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
-                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
-
-                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
-                                           _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
-                v_dst1 = _mm_mul_ps(v_dst1, v_scale);
-
-                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
-                                           _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
-                v_dst2 = _mm_mul_ps(v_dst2, v_scale);
-
-                __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
-                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
-            }
-        }
-
-        return x;
-    }
-
-    bool haveSSE;
-};
-
-#endif
-
-template <>
-struct Mul_SIMD<schar, float>
-{
-    Mul_SIMD()
-    {
-        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
-    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if (!haveSSE)
-            return x;
-
-        __m128i v_zero = _mm_setzero_si128();
-
-        if( scale == 1.0f )
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
-                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
-
-                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
-                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
-
-                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
-                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
-
-                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
-                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
-
-                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
-            }
-        else
-        {
-            __m128 v_scale = _mm_set1_ps(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
-                __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
-
-                v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
-                v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
-
-                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
-                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
-                v_dst1 = _mm_mul_ps(v_dst1, v_scale);
-
-                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
-                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
-                v_dst2 = _mm_mul_ps(v_dst2, v_scale);
-
-                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
-                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
-            }
-        }
-
-        return x;
-    }
-
-    bool haveSSE;
-};
-
-template <>
-struct Mul_SIMD<short, float>
-{
-    Mul_SIMD()
-    {
-        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
-    int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
-    {
-        int x = 0;
-
-        if (!haveSSE)
-            return x;
-
-        __m128i v_zero = _mm_setzero_si128();
-
-        if( scale != 1.0f )
-        {
-            __m128 v_scale = _mm_set1_ps(scale);
-            for ( ; x <= width - 8; x += 8)
-            {
-                __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
-                __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
-
-                __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
-                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
-                v_dst1 = _mm_mul_ps(v_dst1, v_scale);
-
-                __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
-                                           _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
-                v_dst2 = _mm_mul_ps(v_dst2, v_scale);
-
-                __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
-                _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
-            }
-        }
-
-        return x;
-    }
-
-    bool haveSSE;
-};
-
-#endif
-
-template <typename T>
-struct Div_SIMD
-{
-    int operator() (const T *, const T *, T *, int, double) const
-    {
-        return 0;
-    }
-};
-
-template <typename T>
-struct Recip_SIMD
-{
-    int operator() (const T *, T *, int, double) const
-    {
-        return 0;
-    }
-};
-
-
-#if CV_SIMD128
-
-template <>
-struct Div_SIMD<uchar>
-{
-    bool haveSIMD;
-    Div_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_uint16x8 v_zero = v_setzero_u16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_uint16x8 v_src1 = v_load_expand(src1 + x);
-            v_uint16x8 v_src2 = v_load_expand(src2 + x);
-
-            v_uint32x4 t0, t1, t2, t3;
-            v_expand(v_src1, t0, t1);
-            v_expand(v_src2, t2, t3);
-
-            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
-            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
-
-            v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
-            v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
-
-            f0 = f0 * v_scale / f2;
-            f1 = f1 * v_scale / f3;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_uint16x8 res = v_pack_u(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_pack_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-
-template <>
-struct Div_SIMD<schar>
-{
-    bool haveSIMD;
-    Div_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_int16x8 v_zero = v_setzero_s16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_int16x8 v_src1 = v_load_expand(src1 + x);
-            v_int16x8 v_src2 = v_load_expand(src2 + x);
-
-            v_int32x4 t0, t1, t2, t3;
-            v_expand(v_src1, t0, t1);
-            v_expand(v_src2, t2, t3);
-
-            v_float32x4 f0 = v_cvt_f32(t0);
-            v_float32x4 f1 = v_cvt_f32(t1);
-
-            v_float32x4 f2 = v_cvt_f32(t2);
-            v_float32x4 f3 = v_cvt_f32(t3);
-
-            f0 = f0 * v_scale / f2;
-            f1 = f1 * v_scale / f3;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_int16x8 res = v_pack(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_pack_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-
-template <>
-struct Div_SIMD<ushort>
-{
-    bool haveSIMD;
-    Div_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_uint16x8 v_zero = v_setzero_u16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_uint16x8 v_src1 = v_load(src1 + x);
-            v_uint16x8 v_src2 = v_load(src2 + x);
-
-            v_uint32x4 t0, t1, t2, t3;
-            v_expand(v_src1, t0, t1);
-            v_expand(v_src2, t2, t3);
-
-            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
-            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
-
-            v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
-            v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
-
-            f0 = f0 * v_scale / f2;
-            f1 = f1 * v_scale / f3;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_uint16x8 res = v_pack_u(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Div_SIMD<short>
-{
-    bool haveSIMD;
-    Div_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_int16x8 v_zero = v_setzero_s16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_int16x8 v_src1 = v_load(src1 + x);
-            v_int16x8 v_src2 = v_load(src2 + x);
-
-            v_int32x4 t0, t1, t2, t3;
-            v_expand(v_src1, t0, t1);
-            v_expand(v_src2, t2, t3);
-
-            v_float32x4 f0 = v_cvt_f32(t0);
-            v_float32x4 f1 = v_cvt_f32(t1);
-
-            v_float32x4 f2 = v_cvt_f32(t2);
-            v_float32x4 f3 = v_cvt_f32(t3);
-
-            f0 = f0 * v_scale / f2;
-            f1 = f1 * v_scale / f3;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_int16x8 res = v_pack(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Div_SIMD<int>
-{
-    bool haveSIMD;
-    Div_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_int32x4 v_zero = v_setzero_s32();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_int32x4 t0 = v_load(src1 + x);
-            v_int32x4 t1 = v_load(src1 + x + 4);
-            v_int32x4 t2 = v_load(src2 + x);
-            v_int32x4 t3 = v_load(src2 + x + 4);
-
-            v_float32x4 f0 = v_cvt_f32(t0);
-            v_float32x4 f1 = v_cvt_f32(t1);
-            v_float32x4 f2 = v_cvt_f32(t2);
-            v_float32x4 f3 = v_cvt_f32(t3);
-
-            f0 = f0 * v_scale / f2;
-            f1 = f1 * v_scale / f3;
-
-            v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
-
-            res0 = v_select(t2 == v_zero, v_zero, res0);
-            res1 = v_select(t3 == v_zero, v_zero, res1);
-            v_store(dst + x, res0);
-            v_store(dst + x + 4, res1);
-        }
-
-        return x;
-    }
-};
-
-
-template <>
-struct Div_SIMD<float>
-{
-    bool haveSIMD;
-    Div_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_float32x4 f0 = v_load(src1 + x);
-            v_float32x4 f1 = v_load(src1 + x + 4);
-            v_float32x4 f2 = v_load(src2 + x);
-            v_float32x4 f3 = v_load(src2 + x + 4);
-
-            v_float32x4 res0 = f0 * v_scale / f2;
-            v_float32x4 res1 = f1 * v_scale / f3;
-
-            v_store(dst + x, res0);
-            v_store(dst + x + 4, res1);
-        }
-
-        return x;
-    }
-};
-
-
-///////////////////////// RECIPROCAL //////////////////////
-
-template <>
-struct Recip_SIMD<uchar>
-{
-    bool haveSIMD;
-    Recip_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const uchar * src2, uchar * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_uint16x8 v_zero = v_setzero_u16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_uint16x8 v_src2 = v_load_expand(src2 + x);
-
-            v_uint32x4 t0, t1;
-            v_expand(v_src2, t0, t1);
-
-            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
-            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
-
-            f0 = v_scale / f0;
-            f1 = v_scale / f1;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_uint16x8 res = v_pack_u(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_pack_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-
-template <>
-struct Recip_SIMD<schar>
-{
-    bool haveSIMD;
-    Recip_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const schar * src2, schar * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_int16x8 v_zero = v_setzero_s16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_int16x8 v_src2 = v_load_expand(src2 + x);
-
-            v_int32x4 t0, t1;
-            v_expand(v_src2, t0, t1);
-
-            v_float32x4 f0 = v_cvt_f32(t0);
-            v_float32x4 f1 = v_cvt_f32(t1);
-
-            f0 = v_scale / f0;
-            f1 = v_scale / f1;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_int16x8 res = v_pack(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_pack_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-
-template <>
-struct Recip_SIMD<ushort>
-{
-    bool haveSIMD;
-    Recip_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const ushort * src2, ushort * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_uint16x8 v_zero = v_setzero_u16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_uint16x8 v_src2 = v_load(src2 + x);
-
-            v_uint32x4 t0, t1;
-            v_expand(v_src2, t0, t1);
-
-            v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
-            v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
-
-            f0 = v_scale / f0;
-            f1 = v_scale / f1;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_uint16x8 res = v_pack_u(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Recip_SIMD<short>
-{
-    bool haveSIMD;
-    Recip_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const short * src2, short * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_int16x8 v_zero = v_setzero_s16();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_int16x8 v_src2 = v_load(src2 + x);
-
-            v_int32x4 t0, t1;
-            v_expand(v_src2, t0, t1);
-
-            v_float32x4 f0 = v_cvt_f32(t0);
-            v_float32x4 f1 = v_cvt_f32(t1);
-
-            f0 = v_scale / f0;
-            f1 = v_scale / f1;
-
-            v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
-            v_int16x8 res = v_pack(i0, i1);
-
-            res = v_select(v_src2 == v_zero, v_zero, res);
-            v_store(dst + x, res);
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Recip_SIMD<int>
-{
-    bool haveSIMD;
-    Recip_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const int * src2, int * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-        v_int32x4 v_zero = v_setzero_s32();
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_int32x4 t0 = v_load(src2 + x);
-            v_int32x4 t1 = v_load(src2 + x + 4);
-
-            v_float32x4 f0 = v_cvt_f32(t0);
-            v_float32x4 f1 = v_cvt_f32(t1);
-
-            f0 = v_scale / f0;
-            f1 = v_scale / f1;
-
-            v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
-
-            res0 = v_select(t0 == v_zero, v_zero, res0);
-            res1 = v_select(t1 == v_zero, v_zero, res1);
-            v_store(dst + x, res0);
-            v_store(dst + x + 4, res1);
-        }
-
-        return x;
-    }
-};
-
-
-template <>
-struct Recip_SIMD<float>
-{
-    bool haveSIMD;
-    Recip_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const float * src2, float * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float32x4 v_scale = v_setall_f32((float)scale);
-
-        for ( ; x <= width - 8; x += 8)
-        {
-            v_float32x4 f0 = v_load(src2 + x);
-            v_float32x4 f1 = v_load(src2 + x + 4);
-
-            v_float32x4 res0 = v_scale / f0;
-            v_float32x4 res1 = v_scale / f1;
-
-            v_store(dst + x, res0);
-            v_store(dst + x + 4, res1);
-        }
-
-        return x;
-    }
-};
-
-#if CV_SIMD128_64F
-
-template <>
-struct Div_SIMD<double>
-{
-    bool haveSIMD;
-    Div_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float64x2 v_scale = v_setall_f64(scale);
-
-        for ( ; x <= width - 4; x += 4)
-        {
-            v_float64x2 f0 = v_load(src1 + x);
-            v_float64x2 f1 = v_load(src1 + x + 2);
-            v_float64x2 f2 = v_load(src2 + x);
-            v_float64x2 f3 = v_load(src2 + x + 2);
-
-            v_float64x2 res0 = f0 * v_scale / f2;
-            v_float64x2 res1 = f1 * v_scale / f3;
-
-            v_store(dst + x, res0);
-            v_store(dst + x + 2, res1);
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct Recip_SIMD<double>
-{
-    bool haveSIMD;
-    Recip_SIMD() { haveSIMD = hasSIMD128(); }
-
-    int operator() (const double * src2, double * dst, int width, double scale) const
-    {
-        int x = 0;
-
-        if (!haveSIMD)
-            return x;
-
-        v_float64x2 v_scale = v_setall_f64(scale);
-
-        for ( ; x <= width - 4; x += 4)
-        {
-            v_float64x2 f0 = v_load(src2 + x);
-            v_float64x2 f1 = v_load(src2 + x + 2);
-
-            v_float64x2 res0 = v_scale / f0;
-            v_float64x2 res1 = v_scale / f1;
-
-            v_store(dst + x, res0);
-            v_store(dst + x + 2, res1);
-        }
-
-        return x;
-    }
-};
-
-#endif
-
-#endif
-
-
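[annotation] Throughout the Div_SIMD/Recip_SIMD kernels above, v_select(v_src2 == v_zero, v_zero, res) enforces OpenCV's division convention for integer types: any element divided by zero yields zero (floating-point paths are left unmasked). A scalar reference for the same pattern, as a sketch rather than code from this patch:

    // OpenCV convention for integer element types: x / 0 == 0
    template<typename T> static inline T div_scalar(T a, T b, double scale)
    {
        return b != 0 ? cv::saturate_cast<T>(a * scale / b) : T(0);
    }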
-template <typename T, typename WT>
-struct AddWeighted_SIMD
-{
-    int operator() (const T *, const T *, T *, int, WT, WT, WT) const
-    {
-        return 0;
-    }
-};
-
-#if CV_SSE2
-
-template <>
-struct AddWeighted_SIMD<schar, float>
-{
-    AddWeighted_SIMD()
-    {
-        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
-    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
-    {
-        int x = 0;
-
-        if (!haveSSE2)
-            return x;
-
-        __m128i v_zero = _mm_setzero_si128();
-        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
-               v_gamma = _mm_set1_ps(gamma);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
-            __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
-
-            __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
-            __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
-
-            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
-            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
-                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
-
-            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
-            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
-                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
-
-            __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
-                                              _mm_cvtps_epi32(v_dstf1));
-
-            _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
-        }
-
-        return x;
-    }
-
-    bool haveSSE2;
-};
-
-template <>
-struct AddWeighted_SIMD<short, float>
-{
-    AddWeighted_SIMD()
-    {
-        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
-    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
-    {
-        int x = 0;
-
-        if (!haveSSE2)
-            return x;
-
-        __m128i v_zero = _mm_setzero_si128();
-        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
-               v_gamma = _mm_set1_ps(gamma);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
-            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
-
-            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
-            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
-                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
-
-            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
-            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
-                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
-
-            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
-                                                                   _mm_cvtps_epi32(v_dstf1)));
-        }
-
-        return x;
-    }
-
-    bool haveSSE2;
-};
-
-#if CV_SSE4_1
-
-template <>
-struct AddWeighted_SIMD<ushort, float>
-{
-    AddWeighted_SIMD()
-    {
-        haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
-    }
-
-    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
-    {
-        int x = 0;
-
-        if (!haveSSE4_1)
-            return x;
-
-        __m128i v_zero = _mm_setzero_si128();
-        __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
-               v_gamma = _mm_set1_ps(gamma);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
-            __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
-
-            __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
-            v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
-                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
-
-            __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
-            v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
-                                 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
-
-            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
-                                                                    _mm_cvtps_epi32(v_dstf1)));
-        }
-
-        return x;
-    }
-
-    bool haveSSE4_1;
-};
-
-#endif
-
-#elif CV_NEON
-
-template <>
-struct AddWeighted_SIMD<schar, float>
-{
-    int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
-    {
-        int x = 0;
-
-        float32x4_t g = vdupq_n_f32 (gamma);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            int8x8_t in1 = vld1_s8(src1 + x);
-            int16x8_t in1_16 = vmovl_s8(in1);
-            float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
-            float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
-
-            int8x8_t in2 = vld1_s8(src2+x);
-            int16x8_t in2_16 = vmovl_s8(in2);
-            float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
-            float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
-
-            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
-            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
-            out_f_l = vaddq_f32(out_f_l, g);
-            out_f_h = vaddq_f32(out_f_h, g);
-
-            int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
-            int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
-
-            int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
-            int8x8_t out = vqmovn_s16(out_16);
-
-            vst1_s8(dst + x, out);
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct AddWeighted_SIMD<ushort, float>
-{
-    int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
-    {
-        int x = 0;
-
-        float32x4_t g = vdupq_n_f32(gamma);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
-
-            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
-            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
-            uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
-
-            v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
-            v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
-            uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
-
-            vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
-        }
-
-        return x;
-    }
-};
-
-template <>
-struct AddWeighted_SIMD<short, float>
-{
-    int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
-    {
-        int x = 0;
-
-        float32x4_t g = vdupq_n_f32(gamma);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
-
-            float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
-            float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
-            int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
-
-            v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
-            v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
-            int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
-
-            vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
-        }
-
-        return x;
-    }
-};
-
-#endif
-
-}
-
-#endif // __OPENCV_ARITHM_SIMD_HPP__
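[annotation] The remaining files in this patch replace OpenCV's exception-compatibility macros with plain C++ keywords. Based on their expansion in 3.4's opencv2/core/cvdef.h when exceptions are enabled (an assumption stated here, not part of the diff), the substitution is mechanical:

    // CV_TRY          ->  try
    // CV_CATCH(A, B)  ->  catch (const A& B)
    // CV_CATCH_ALL    ->  catch (...)
    // CV_THROW(A)     ->  throw A
    // CV_RETHROW()    ->  throw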
CV_Error_(Error::StsBadArg, ("undeclared key '%s' requested", name.c_str())); } void CommandLineParser::getByIndex(int index, bool space_delete, Param type, void* dst) const { - CV_TRY + try { for (size_t i = 0; i < impl->data.size(); i++) { @@ -176,12 +177,13 @@ void CommandLineParser::getByIndex(int index, bool space_delete, Param type, voi } } } - CV_CATCH(Exception, e) + catch (const Exception& e) { impl->error = true; impl->error_message = impl->error_message + format("Parameter #%d: ", index) + e.err + "\n"; return; } + CV_Error_(Error::StsBadArg, ("undeclared position %d requested", index)); } @@ -455,13 +457,14 @@ std::vector CommandLineParser::Impl::split_range_string(const String& _s std::vector vec; String word = ""; bool begin = false; + while (!str.empty()) { if (str[0] == fs) { if (begin == true) { - CV_THROW (cv::Exception(CV_StsParseError, + throw cv::Exception(CV_StsParseError, String("error in split_range_string(") + str + String(", ") @@ -470,7 +473,7 @@ std::vector CommandLineParser::Impl::split_range_string(const String& _s + String(1, ss) + String(")"), "", __FILE__, __LINE__ - )); + ); } begin = true; word = ""; @@ -481,7 +484,7 @@ std::vector CommandLineParser::Impl::split_range_string(const String& _s { if (begin == false) { - CV_THROW (cv::Exception(CV_StsParseError, + throw cv::Exception(CV_StsParseError, String("error in split_range_string(") + str + String(", ") @@ -490,7 +493,7 @@ std::vector CommandLineParser::Impl::split_range_string(const String& _s + String(1, ss) + String(")"), "", __FILE__, __LINE__ - )); + ); } begin = false; vec.push_back(word); @@ -505,7 +508,7 @@ std::vector CommandLineParser::Impl::split_range_string(const String& _s if (begin == true) { - CV_THROW (cv::Exception(CV_StsParseError, + throw cv::Exception(CV_StsParseError, String("error in split_range_string(") + str + String(", ") @@ -514,8 +517,9 @@ std::vector CommandLineParser::Impl::split_range_string(const String& _s + String(1, ss) + String(")"), "", __FILE__, __LINE__ - )); + ); } + return vec; } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index e31e1b2c43..766772002c 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -442,7 +442,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) _dst.create( dims, size, _type ); Mat dst = _dst.getMat(); - BinaryFunc func = noScale ? 
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index e31e1b2c43..766772002c 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -442,7 +442,6 @@ void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta)
     _dst.create( dims, size, _type );
     Mat dst = _dst.getMat();
-
     BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
     double scale[] = {alpha, beta};
     int cn = channels();
@@ -450,7 +449,7 @@
 
     if( dims <= 2 )
     {
-        Size sz = getContinuousSize(src, dst, cn);
+        Size sz = getContinuousSize2D(src, dst, cn);
         func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
     }
     else
@@ -511,7 +510,7 @@
 
     if( src.dims <= 2 )
    {
-        Size sz = getContinuousSize(src, dst, cn);
+        Size sz = getContinuousSize2D(src, dst, cn);
         func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0);
    }
    else
diff --git a/modules/core/src/convert_scale.cpp b/modules/core/src/convert_scale.cpp
index ba9c023211..9bd3c4baf8 100644
--- a/modules/core/src/convert_scale.cpp
+++ b/modules/core/src/convert_scale.cpp
@@ -426,7 +426,7 @@ void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, doubl
 
     if( src.dims <= 2 )
     {
-        Size sz = getContinuousSize(src, dst, cn);
+        Size sz = getContinuousSize2D(src, dst, cn);
         func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale );
     }
     else
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index fc4f363c7b..6d6aaff112 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -287,23 +287,19 @@ void Mat::copyTo( OutputArray _dst ) const
 
         if( rows > 0 && cols > 0 )
         {
-            // For some cases (with vector) dst.size != src.size, so force to column-based form
-            // It prevents memory corruption in case of column-based src
-            if (_dst.isVector())
-                dst = dst.reshape(0, (int)dst.total());
+            Mat src = *this;
+            Size sz = getContinuousSize2D(src, dst, (int)elemSize());
+            CV_CheckGE(sz.width, 0, "");
 
-            const uchar* sptr = data;
+            const uchar* sptr = src.data;
             uchar* dptr = dst.data;
 
 #if IPP_VERSION_X100 >= 201700
-            CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiCopy_8u_C1R_L, sptr, (int)step, dptr, (int)dst.step, ippiSizeL((int)(cols*elemSize()), rows)) >= 0)
+            CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiCopy_8u_C1R_L, sptr, (int)src.step, dptr, (int)dst.step, ippiSizeL(sz.width, sz.height)) >= 0)
 #endif
 
-            Size sz = getContinuousSize(*this, dst);
-            size_t len = sz.width*elemSize();
-
-            for( ; sz.height--; sptr += step, dptr += dst.step )
-                memcpy( dptr, sptr, len );
+            for (; sz.height--; sptr += src.step, dptr += dst.step)
+                memcpy(dptr, sptr, sz.width);
         }
         return;
     }
@@ -403,8 +399,9 @@ void Mat::copyTo( OutputArray _dst, InputArray _mask ) const
 
     if( dims <= 2 )
     {
-        Size sz = getContinuousSize(*this, dst, mask, mcn);
-        copymask(data, step, mask.data, mask.step, dst.data, dst.step, sz, &esz);
+        Mat src = *this;
+        Size sz = getContinuousSize2D(src, dst, mask, mcn);
+        copymask(src.data, src.step, mask.data, mask.step, dst.data, dst.step, sz, &esz);
         return;
     }
diff --git a/modules/core/src/glob.cpp b/modules/core/src/glob.cpp
index 76f20e2c48..f213bcbc7b 100644
--- a/modules/core/src/glob.cpp
+++ b/modules/core/src/glob.cpp
@@ -231,7 +231,7 @@ static void glob_rec(const cv::String& directory, const cv::String& wildchart, s
     if ((dir = opendir (directory.c_str())) != 0)
     {
         /* find all the files and directories within directory */
-        CV_TRY
+        try
         {
             struct dirent *ent;
             while ((ent = readdir (dir)) != 0)
@@ -255,10 +255,10 @@ static void glob_rec(const cv::String& directory, const cv::String& wildchart, s
                     result.push_back(entry);
             }
         }
-        CV_CATCH_ALL
+        catch (...)
         {
             closedir(dir);
-            CV_RETHROW();
+            throw;
         }
         closedir(dir);
     }
diff --git a/modules/core/src/lda.cpp b/modules/core/src/lda.cpp
index 618b9bb451..fb87ff8789 100644
--- a/modules/core/src/lda.cpp
+++ b/modules/core/src/lda.cpp
@@ -866,7 +866,7 @@ private:
         d = alloc_1d<double> (n);
         e = alloc_1d<double> (n);
         ort = alloc_1d<double> (n);
-        CV_TRY {
+        try {
             // Reduce to Hessenberg form.
             orthes();
             // Reduce Hessenberg to real Schur form.
@@ -884,10 +884,10 @@ private:
             // Deallocate the memory by releasing all internal working data.
             release();
         }
-        CV_CATCH_ALL
+        catch (...)
        {
            release();
-           CV_RETHROW();
+           throw;
        }
    }
diff --git a/modules/core/src/lut.cpp b/modules/core/src/lut.cpp
index 3b45b906e0..f5dc205082 100644
--- a/modules/core/src/lut.cpp
+++ b/modules/core/src/lut.cpp
@@ -120,11 +120,11 @@ static bool openvx_LUT(Mat src, Mat dst, Mat _lut)
         lut.copyFrom(_lut);
         ivx::IVX_CHECK_STATUS(vxuTableLookup(ctx, ia, lut, ib));
     }
-    catch (ivx::RuntimeError & e)
+    catch (const ivx::RuntimeError& e)
     {
         VX_DbgThrow(e.what());
     }
-    catch (ivx::WrapperError & e)
+    catch (const ivx::WrapperError& e)
     {
         VX_DbgThrow(e.what());
     }
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 7e69695b4d..a4e5263aa8 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -1489,7 +1489,7 @@ bool checkRange(InputArray _src, bool quiet, Point* pt, double minVal, double ma
     {
         int i, loc = 0;
         int cn = src.channels();
-        Size size = getContinuousSize( src, cn );
+        Size size = getContinuousSize2D(src, cn);
 
         if( depth == CV_32F )
         {
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index a8fe6fb193..6824bb5a41 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -416,7 +416,7 @@ Mat::Mat(const Mat& m, const Range& _rowRange, const Range& _colRange)
     }
 
     *this = m;
-    CV_TRY
+    try
     {
         if( _rowRange != Range::all() && _rowRange != Range(0,rows) )
         {
@@ -436,10 +436,10 @@
             flags |= SUBMATRIX_FLAG;
         }
     }
-    CV_CATCH_ALL
+    catch(...)
    {
        release();
-       CV_RETHROW();
+       throw;
    }
 
     updateContinuityFlag();
@@ -943,4 +943,77 @@ int Mat::checkVector(int _elemChannels, int _depth, bool _requireContinuous) con
         ? (int)(total()*channels()/_elemChannels) : -1;
 }
 
+
+static inline Size getContinuousSize_(int flags, int cols, int rows, int widthScale)
+{
+    int64 sz = (int64)cols * rows * widthScale;
+    bool has_int_overflow = sz >= INT_MAX;
+    bool isContiguous = (flags & Mat::CONTINUOUS_FLAG) != 0;
+    return (isContiguous && !has_int_overflow)
+            ? Size((int)sz, 1)
+            : Size(cols * widthScale, rows);
+}
+
+Size getContinuousSize2D(Mat& m1, int widthScale)
+{
+    CV_CheckLE(m1.dims, 2, "");
+    return getContinuousSize_(m1.flags,
+                              m1.cols, m1.rows, widthScale);
+}
+Size getContinuousSize2D(Mat& m1, Mat& m2, int widthScale)
+{
+    CV_CheckLE(m1.dims, 2, "");
+    CV_CheckLE(m2.dims, 2, "");
+    const Size sz1 = m1.size();
+    if (sz1 != m2.size())  // reshape all matrices to the same size (#4159)
+    {
+        size_t total_sz = m1.total();
+        CV_CheckEQ(total_sz, m2.total(), "");
+        bool is_m1_vector = m1.cols == 1 || m1.rows == 1;
+        bool is_m2_vector = m2.cols == 1 || m2.rows == 1;
+        CV_Assert(is_m1_vector); CV_Assert(is_m2_vector);
+        int total = (int)total_sz;  // vector-column
+        bool isContiguous = ((m1.flags & m2.flags) & Mat::CONTINUOUS_FLAG) != 0;
+        bool has_int_overflow = ((int64)total_sz * widthScale) >= INT_MAX;
+        if (isContiguous && !has_int_overflow)
+            total = 1;  // vector-row
+        m1 = m1.reshape(0, total);
+        m2 = m2.reshape(0, total);
+        CV_Assert(m1.cols == m2.cols && m1.rows == m2.rows);
+        return Size(m1.cols * widthScale, m1.rows);
+    }
+    return getContinuousSize_(m1.flags & m2.flags,
+                              m1.cols, m1.rows, widthScale);
+}
+
+Size getContinuousSize2D(Mat& m1, Mat& m2, Mat& m3, int widthScale)
+{
+    CV_CheckLE(m1.dims, 2, "");
+    CV_CheckLE(m2.dims, 2, "");
+    CV_CheckLE(m3.dims, 2, "");
+    const Size sz1 = m1.size();
+    if (sz1 != m2.size() || sz1 != m3.size())  // reshape all matrices to the same size (#4159)
+    {
+        size_t total_sz = m1.total();
+        CV_CheckEQ(total_sz, m2.total(), "");
+        CV_CheckEQ(total_sz, m3.total(), "");
+        bool is_m1_vector = m1.cols == 1 || m1.rows == 1;
+        bool is_m2_vector = m2.cols == 1 || m2.rows == 1;
+        bool is_m3_vector = m3.cols == 1 || m3.rows == 1;
+        CV_Assert(is_m1_vector); CV_Assert(is_m2_vector); CV_Assert(is_m3_vector);
+        int total = (int)total_sz;  // vector-column
+        bool isContiguous = ((m1.flags & m2.flags & m3.flags) & Mat::CONTINUOUS_FLAG) != 0;
+        bool has_int_overflow = ((int64)total_sz * widthScale) >= INT_MAX;
+        if (isContiguous && !has_int_overflow)
+            total = 1;  // vector-row
+        m1 = m1.reshape(0, total);
+        m2 = m2.reshape(0, total);
+        m3 = m3.reshape(0, total);
+        CV_Assert(m1.cols == m2.cols && m1.rows == m2.rows && m1.cols == m3.cols && m1.rows == m3.rows);
+        return Size(m1.cols * widthScale, m1.rows);
+    }
+    return getContinuousSize_(m1.flags & m2.flags & m3.flags,
+                              m1.cols, m1.rows, widthScale);
+}
+
 } // cv::
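[annotation] getContinuousSize2D now owns the vector-reshape logic that copyTo() previously did ad hoc: operands with equal total() but different 2D shapes (row- vs. column-vectors) are reshaped to a common shape before the width is scaled, and the int64 check guards cols*rows*widthScale against int overflow. The effect callers rely on, in sketch form with illustrative sizes:

    cv::Mat a(480, 640, CV_8UC1), b(480, 640, CV_8UC1);
    // both continuous: the caller's loop sees Size(640*480, 1) -> one single-row pass
    // strided/ROI views (as in the issue_13078 test below): Size(640, 480), per-row loop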
diff --git a/modules/core/src/mean.cpp b/modules/core/src/mean.cpp
index e11fa4e8f6..da514a6f29 100644
--- a/modules/core/src/mean.cpp
+++ b/modules/core/src/mean.cpp
@@ -654,11 +654,11 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
                 pstddev[c] = 0;
             }
         }
     }
-    catch (ivx::RuntimeError & e)
+    catch (const ivx::RuntimeError & e)
     {
         VX_DbgThrow(e.what());
     }
-    catch (ivx::WrapperError & e)
+    catch (const ivx::WrapperError & e)
     {
         VX_DbgThrow(e.what());
     }
diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp
index 50ce4ce883..da75e20a1d 100644
--- a/modules/core/src/minmax.cpp
+++ b/modules/core/src/minmax.cpp
@@ -439,11 +439,11 @@ static bool openvx_minMaxIdx(Mat &src, double* minVal, double* maxVal, int* minI
             ofs2idx(src, maxidx, maxIdx);
         }
     }
-    catch (ivx::RuntimeError & e)
+    catch (const ivx::RuntimeError & e)
     {
         VX_DbgThrow(e.what());
     }
-    catch (ivx::WrapperError & e)
+    catch (const ivx::WrapperError & e)
     {
         VX_DbgThrow(e.what());
     }
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index ee21561539..d0d74d0672 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -894,11 +894,11 @@ bool useOpenCL()
     CoreTLSData* data = getCoreTlsData().get();
     if( data->useOpenCL < 0 )
     {
-        CV_TRY
+        try
         {
             data->useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
         }
-        CV_CATCH_ALL
+        catch (...)
         {
             data->useOpenCL = 0;
         }
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 3aee8486b5..0b9430092c 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -86,7 +86,6 @@
 #include "opencv2/core/sse_utils.hpp"
 #include "opencv2/core/neon_utils.hpp"
 #include "opencv2/core/vsx_utils.hpp"
-#include "arithm_core.hpp"
 #include "hal_replacement.hpp"
 
 #define GET_OPTIMIZED(func) (func)
@@ -106,6 +105,102 @@ extern const uchar g_Saturate8u[];
 #define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
 #define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))
 
+template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
+{
+    typedef T1 type1;
+    typedef T2 type2;
+    typedef T3 rtype;
+    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
+};
+
+template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
+{
+    typedef T1 type1;
+    typedef T2 type2;
+    typedef T3 rtype;
+    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
+};
+
+template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
+{
+    typedef T1 type1;
+    typedef T2 type2;
+    typedef T3 rtype;
+    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
+};
+
+template<typename T> struct OpMin
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator ()(const T a, const T b) const { return std::min(a, b); }
+};
+
+template<typename T> struct OpMax
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator ()(const T a, const T b) const { return std::max(a, b); }
+};
+
+template<typename T> struct OpAbsDiff
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()(T a, T b) const { return a > b ? a - b : b - a; }
+};
+
+// specializations to prevent "-0" results
+template<> struct OpAbsDiff<float>
+{
+    typedef float type1;
+    typedef float type2;
+    typedef float rtype;
+    float operator()(float a, float b) const { return std::abs(a - b); }
+};
+template<> struct OpAbsDiff<double>
+{
+    typedef double type1;
+    typedef double type2;
+    typedef double rtype;
+    double operator()(double a, double b) const { return std::abs(a - b); }
+};
+
+template<typename T> struct OpAnd
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()( T a, T b ) const { return a & b; }
+};
+
+template<typename T> struct OpOr
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()( T a, T b ) const { return a | b; }
+};
+
+template<typename T> struct OpXor
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()( T a, T b ) const { return a ^ b; }
+};
+
+template<typename T> struct OpNot
+{
+    typedef T type1;
+    typedef T type2;
+    typedef T rtype;
+    T operator()( T a, T ) const { return ~a; }
+};
+
 template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
 { return CV_FAST_CAST_8U(a + b); }
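[annotation] The Op* functors moved here from the deleted arithm_core.hpp are the scalar fallbacks behind the arithmetic dispatch: each names its operand/result types and saturates through saturate_cast. A minimal sketch of how such a functor drives a generic row loop (illustrative only; the real loops live in the arithmetic kernels):

    template<class Op> static void opRow(const typename Op::type1* a,
                                         const typename Op::type2* b,
                                         typename Op::rtype* dst, int n)
    {
        Op op;
        for (int i = 0; i < n; i++)
            dst[i] = op(a[i], b[i]);  // e.g. opRow<OpAdd<uchar> >(...) saturates at 255
    }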
Size((int)sz, 1) : Size(cols * widthScale, rows); -} - -inline Size getContinuousSize( const Mat& m1, int widthScale=1 ) -{ - return getContinuousSize_(m1.flags, - m1.cols, m1.rows, widthScale); -} - -inline Size getContinuousSize( const Mat& m1, const Mat& m2, int widthScale=1 ) -{ - return getContinuousSize_(m1.flags & m2.flags, - m1.cols, m1.rows, widthScale); -} - -inline Size getContinuousSize( const Mat& m1, const Mat& m2, - const Mat& m3, int widthScale=1 ) -{ - return getContinuousSize_(m1.flags & m2.flags & m3.flags, - m1.cols, m1.rows, widthScale); -} - -inline Size getContinuousSize( const Mat& m1, const Mat& m2, - const Mat& m3, const Mat& m4, - int widthScale=1 ) -{ - return getContinuousSize_(m1.flags & m2.flags & m3.flags & m4.flags, - m1.cols, m1.rows, widthScale); -} - -inline Size getContinuousSize( const Mat& m1, const Mat& m2, - const Mat& m3, const Mat& m4, - const Mat& m5, int widthScale=1 ) -{ - return getContinuousSize_(m1.flags & m2.flags & m3.flags & m4.flags & m5.flags, - m1.cols, m1.rows, widthScale); -} +// There is some mess in code with vectors representation. +// Both vector-column / vector-rows are used with dims=2 (as Mat2D always). +// Reshape matrices if neccessary (in case of vectors) and returns size with scaled width. +Size getContinuousSize2D(Mat& m1, int widthScale=1); +Size getContinuousSize2D(Mat& m1, Mat& m2, int widthScale=1); +Size getContinuousSize2D(Mat& m1, Mat& m2, Mat& m3, int widthScale=1); void setSize( Mat& m, int _dims, const int* _sz, const size_t* _steps, bool autoSteps=false ); void finalizeHdr(Mat& m); diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 777111721a..9f82a15654 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -1029,7 +1029,7 @@ void error( const Exception& exc ) *p = 0; } - CV_THROW(exc); + throw exc; #ifdef __GNUC__ # if !defined __clang__ && !defined __APPLE__ // this suppresses this warning: "noreturn" function does return [enabled by default] diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 836076741c..2d1df2c602 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -367,11 +367,11 @@ UMat Mat::getUMat(AccessFlag accessFlags, UMatUsageFlags usageFlags) const new_u->originalUMatData = u; } bool allocated = false; - CV_TRY + try { allocated = UMat::getStdAllocator()->allocate(new_u, accessFlags, usageFlags); } - CV_CATCH(cv::Exception, e) + catch (const cv::Exception& e) { fprintf(stderr, "Exception: %s\n", e.what()); } @@ -442,12 +442,12 @@ void UMat::create(int d, const int* _sizes, int _type, UMatUsageFlags _usageFlag a = a0; a0 = Mat::getDefaultAllocator(); } - CV_TRY + try { u = a->allocate(dims, size, _type, 0, step.p, ACCESS_RW /* ignored */, usageFlags); CV_Assert(u != 0); } - CV_CATCH_ALL + catch(...) 
{ if(a != a0) u = a0->allocate(dims, size, _type, 0, step.p, ACCESS_RW /* ignored */, usageFlags); diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 40d282b1c2..b28929c582 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -119,11 +119,15 @@ template struct Data d[i] += (LaneType)m; return *this; } - void fill(LaneType val) + void fill(LaneType val, int s, int c = R::nlanes) { - for (int i = 0; i < R::nlanes; ++i) + for (int i = s; i < c; ++i) d[i] = val; } + void fill(LaneType val) + { + fill(val, 0); + } void reverse() { for (int i = 0; i < R::nlanes / 2; ++i) @@ -739,6 +743,23 @@ template struct TheTest return *this; } + TheTest & test_absdiffs() + { + Data dataA(std::numeric_limits::max()), + dataB(std::numeric_limits::min()); + dataA[0] = (LaneType)-1; + dataB[0] = 1; + dataA[1] = 2; + dataB[1] = (LaneType)-2; + R a = dataA, b = dataB; + Data resC = v_absdiffs(a, b); + for (int i = 0; i < R::nlanes; ++i) + { + EXPECT_EQ(saturate_cast(std::abs(dataA[i] - dataB[i])), resC[i]); + } + return *this; + } + TheTest & test_reduce() { Data dataA; @@ -874,6 +895,81 @@ template struct TheTest return *this; } + // v_uint8 only + TheTest & test_pack_b() + { + // 16-bit + Data dataA, dataB; + dataB.fill(0, R::nlanes / 2); + + R a = dataA, b = dataB; + Data maskA = a == b, maskB = a != b; + + a = maskA; b = maskB; + Data res = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); + for (int i = 0; i < v_uint16::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 2], res[i]); + EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]); + } + + // 32-bit + Data dataC, dataD; + dataD.fill(0, R::nlanes / 2); + + R c = dataC, d = dataD; + Data maskC = c == d, maskD = c != d; + + c = maskC; d = maskD; + res = v_pack_b + ( + v_reinterpret_as_u32(a), v_reinterpret_as_u32(b), + v_reinterpret_as_u32(c), v_reinterpret_as_u32(d) + ); + + for (int i = 0; i < v_uint32::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 4], res[i]); + EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]); + EXPECT_EQ(maskC[i * 4], res[i + v_uint32::nlanes * 2]); + EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]); + } + + // 64-bit + Data dataE, dataF, dataG(0), dataH(0xFF); + dataF.fill(0, R::nlanes / 2); + + R e = dataE, f = dataF, g = dataG, h = dataH; + Data maskE = e == f, maskF = e != f; + + e = maskE; f = maskF; + res = v_pack_b + ( + v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), + v_reinterpret_as_u64(c), v_reinterpret_as_u64(d), + v_reinterpret_as_u64(e), v_reinterpret_as_u64(f), + v_reinterpret_as_u64(g), v_reinterpret_as_u64(h) + ); + + for (int i = 0; i < v_uint64::nlanes; ++i) + { + SCOPED_TRACE(cv::format("i=%d", i)); + EXPECT_EQ(maskA[i * 8], res[i]); + EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]); + EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]); + EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]); + + EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]); + EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]); + EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]); + EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]); + } + + return *this; + } + TheTest & test_unpack() { Data dataA, dataB; @@ -1228,6 +1324,7 @@ void test_hal_intrin_uint8() .test_popcount() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>() + .test_pack_b() .test_unpack() 
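// The new test_absdiffs() exercises v_absdiffs(), the saturating absolute
// difference that keeps the signed lane type (unlike v_absdiff(), which yields
// the corresponding unsigned type). A scalar model of the expected lane value,
// matching the EXPECT_EQ in the test above:
#include <opencv2/core.hpp>   // cv::saturate_cast
#include <cstdlib>            // std::abs(int)
template<typename T> static T absdiffs_ref(T a, T b)
{
    // promote to int so the intermediate difference cannot overflow,
    // then saturate back: absdiffs_ref<schar>(-128, 127) == 127, not -1
    return cv::saturate_cast<T>(std::abs((int)a - (int)b));
}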
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>() .test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>() @@ -1259,6 +1356,7 @@ void test_hal_intrin_int8() .test_logic() .test_min_max() .test_absdiff() + .test_absdiffs() .test_abs() .test_mask() .test_popcount() @@ -1317,6 +1415,7 @@ void test_hal_intrin_int16() .test_logic() .test_min_max() .test_absdiff() + .test_absdiffs() .test_abs() .test_reduce() .test_mask() diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 32b83d431b..b8a29cafe9 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1930,5 +1930,36 @@ TEST(Core_InputArray, support_CustomType) } } +TEST(Core_Vectors, issue_13078) +{ + float floats_[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; + std::vector floats(floats_, floats_ + 8); + std::vector ints(4); + + Mat m(4, 1, CV_32FC1, floats.data(), sizeof(floats[0]) * 2); + + m.convertTo(ints, CV_32S); + + ASSERT_EQ(1, ints[0]); + ASSERT_EQ(3, ints[1]); + ASSERT_EQ(5, ints[2]); + ASSERT_EQ(7, ints[3]); +} + +TEST(Core_Vectors, issue_13078_workaround) +{ + float floats_[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; + std::vector floats(floats_, floats_ + 8); + std::vector ints(4); + + Mat m(4, 1, CV_32FC1, floats.data(), sizeof(floats[0]) * 2); + + m.convertTo(Mat(ints), CV_32S); + + ASSERT_EQ(1, ints[0]); + ASSERT_EQ(3, ints[1]); + ASSERT_EQ(5, ints[2]); + ASSERT_EQ(7, ints[3]); +} }} // namespace diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index d0dc9dfb28..baab007d1a 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1891,44 +1891,46 @@ struct Net::Impl } // fuse convolution layer followed by eltwise + relu - if ( IS_DNN_OPENCL_TARGET(preferableTarget) ) + if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" ) { Ptr nextEltwiseLayer; if( nextData ) nextEltwiseLayer = nextData->layerInstance.dynamicCast(); - if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 ) + if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 && + nextData->inputBlobsId.size() == 2 ) { LayerData *eltwiseData = nextData; - // go down from the second input and find the first non-skipped layer. - LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[1].lid]; - CV_Assert(downLayerData); - while (downLayerData->skip) - { - downLayerData = &layers[downLayerData->inputBlobsId[0].lid]; - } - CV_Assert(downLayerData); - // second input layer is current layer. - if ( ld.id == downLayerData->id ) + // Eltwise layer has two inputs. We need to determine which + // is a base convolution layer and which could be used as it's bias. 
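// The two Core_Vectors tests above pin down issue 13078: a Mat built over
// external data with a custom step (here 2*sizeof(float), so it views elements
// 0, 2, 4, 6 of an 8-float buffer) is non-continuous, and converting such a
// strided view into a std::vector was previously mishandled. A standalone form
// of the scenario:
#include <opencv2/core.hpp>
#include <vector>
int main()
{
    float buf[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    cv::Mat view(4, 1, CV_32FC1, buf, sizeof(float) * 2); // strided view
    std::vector<int> out;
    view.convertTo(out, CV_32S);                          // must yield {1, 3, 5, 7}
    return out[1] == 3 ? 0 : 1;
}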
+ LayerData* biasLayerData = 0; + for (int i = 0; i < 2; ++i) { - // go down from the first input and find the first non-skipped layer - downLayerData = &layers[eltwiseData->inputBlobsId[0].lid]; + LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid]; + CV_Assert(downLayerData); while (downLayerData->skip) { - if ( !downLayerData->type.compare("Eltwise") ) - downLayerData = &layers[downLayerData->inputBlobsId[1].lid]; - else + if (downLayerData->inputBlobsId.size() == 1) downLayerData = &layers[downLayerData->inputBlobsId[0].lid]; + else + { + downLayerData = 0; + break; + } } - - Ptr convLayer = downLayerData->layerInstance.dynamicCast(); - - // first input layer is convolution layer - if( !convLayer.empty() && eltwiseData->consumers.size() == 1 ) + if (downLayerData && ld.id == downLayerData->id) + { + biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid]; + break; + } + } + CV_Assert(biasLayerData); + { + if( eltwiseData->consumers.size() == 1 ) { // fuse eltwise + activation layer - LayerData *firstConvLayerData = downLayerData; + if (biasLayerData->id < ld.id) { nextData = &layers[eltwiseData->consumers[0].lid]; lpNext = LayerPin(eltwiseData->consumers[0].lid, 0); @@ -1942,8 +1944,8 @@ struct Net::Impl !nextData->type.compare("Power")) && currLayer->setActivation(nextActivLayer) ) { - CV_Assert(firstConvLayerData->outputBlobsWrappers.size() == 1 && ld.inputBlobsWrappers.size() == 1); - ld.inputBlobsWrappers.push_back(firstConvLayerData->outputBlobsWrappers[0]); + CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1); + ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]); printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str())); printf_(("\tfused with %s\n", nextActivLayer->name.c_str())); eltwiseData->skip = true; @@ -1994,9 +1996,6 @@ struct Net::Impl } } - if (preferableBackend != DNN_BACKEND_OPENCV) - continue; // Go to the next layer. - // the optimization #2. if there is no layer that takes max pooling layer's computed // max indices (and only some semantical segmentation networks might need this; // many others only take the maximum values), then we switch the max pooling @@ -3184,7 +3183,7 @@ void Net::setHalideScheduler(const String& scheduler) int64 Net::getPerfProfile(std::vector& timings) { timings = std::vector(impl->layersTimings.begin() + 1, impl->layersTimings.end()); - int64 total = std::accumulate(timings.begin(), timings.end(), 0); + int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0); return total; } diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 3ec5c662e0..40a87dbbe4 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -96,7 +96,6 @@ public: else if (params.has("pooled_w") || params.has("pooled_h")) { type = ROI; - computeMaxIdx = false; pooledSize.width = params.get("pooled_w", 1); pooledSize.height = params.get("pooled_h", 1); } @@ -142,6 +141,7 @@ public: #ifdef HAVE_OPENCL poolOp.release(); #endif + computeMaxIdx = type == MAX; } virtual bool supportBackend(int backendId) CV_OVERRIDE @@ -193,19 +193,14 @@ public: poolOp = Ptr >(new OCL4DNNPool(config)); } - for (size_t ii = 0; ii < inputs.size(); ii++) - { - UMat& inpMat = inputs[ii]; - int out_index = (type == MAX) ? 2 : 1; - UMat& outMat = outputs[out_index * ii]; - UMat maskMat = (type == MAX) ? 
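// The getPerfProfile() change above fixes a subtle std::accumulate pitfall:
// the accumulator's type is deduced from the *initial value*, so with a plain
// int 0 every double timing is truncated as it is added. Passing 0.0 keeps the
// running sum in double and only the final total is cast. Demonstration:
#include <numeric>
#include <vector>
#include <cstdio>
int main()
{
    std::vector<double> t = { 0.4, 0.4, 0.4 };
    std::printf("%d\n", (int)std::accumulate(t.begin(), t.end(), 0));   // prints 0
    std::printf("%d\n", (int)std::accumulate(t.begin(), t.end(), 0.0)); // prints 1
    return 0;
}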
outputs[2 * ii + 1] : UMat(); + CV_Assert_N(inputs.size() == 1, !outputs.empty(), !computeMaxIdx || outputs.size() == 2); + UMat& inpMat = inputs[0]; + UMat& outMat = outputs[0]; + UMat maskMat = computeMaxIdx ? outputs[1] : UMat(); - CV_Assert(inpMat.offset == 0 && outMat.offset == 0); + CV_Assert(inpMat.offset == 0 && outMat.offset == 0); - if (!poolOp->Forward(inpMat, outMat, maskMat)) - return false; - } - return true; + return poolOp->Forward(inpMat, outMat, maskMat); } #endif @@ -232,9 +227,12 @@ public: switch (type) { case MAX: - CV_Assert_N(inputs.size() == 1, outputs.size() == 2); - maxPooling(inputs[0], outputs[0], outputs[1]); + { + CV_Assert_N(inputs.size() == 1, !computeMaxIdx || outputs.size() == 2); + Mat mask = computeMaxIdx ? outputs[1] : Mat(); + maxPooling(inputs[0], outputs[0], mask); break; + } case AVE: CV_Assert_N(inputs.size() == 1, outputs.size() == 1); avePooling(inputs[0], outputs[0]); @@ -951,7 +949,10 @@ public: dims[0] = inputs[1][0]; // Number of proposals; dims[1] = psRoiOutChannels; } - outputs.assign(type == MAX ? 2 : 1, shape(dims, 4)); + + int numOutputs = requiredOutputs ? requiredOutputs : (type == MAX ? 2 : 1); + CV_Assert(numOutputs == 1 || (numOutputs == 2 && type == MAX)); + outputs.assign(numOutputs, shape(dims, 4)); return false; } diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index b10c1388f3..f98d78c3bb 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -358,7 +358,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN) (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)) throw SkipTestException(""); - for (int i = 1; i < 2; ++i) + for (int i = 0; i < 2; ++i) { std::string proto = findDataFile("dnn/" + names[i] + ".pbtxt", false); std::string model = findDataFile("dnn/" + names[i] + ".pb", false); diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index 79c2a26334..828a0c5443 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -401,11 +401,11 @@ static bool openvx_FAST(InputArray _img, std::vector& keypoints, img.swapHandle(); #endif } - catch (RuntimeError & e) + catch (const RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (WrapperError & e) + catch (const WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/imgcodecs/src/bitstrm.cpp b/modules/imgcodecs/src/bitstrm.cpp index 86d2bc659b..2f5b44dfeb 100644 --- a/modules/imgcodecs/src/bitstrm.cpp +++ b/modules/imgcodecs/src/bitstrm.cpp @@ -99,7 +99,7 @@ void RBaseStream::readBlock() { if( m_block_pos == 0 && m_current < m_end ) return; - CV_THROW (RBS_THROW_EOS); + throw RBS_THROW_EOS; } fseek( m_file, m_block_pos, SEEK_SET ); @@ -107,7 +107,7 @@ void RBaseStream::readBlock() m_end = m_start + readed; if( readed == 0 || m_current >= m_end ) - CV_THROW (RBS_THROW_EOS); + throw RBS_THROW_EOS; } diff --git a/modules/imgcodecs/src/exif.cpp b/modules/imgcodecs/src/exif.cpp index 22b4f224ce..4dbaf43e81 100644 --- a/modules/imgcodecs/src/exif.cpp +++ b/modules/imgcodecs/src/exif.cpp @@ -80,15 +80,14 @@ ExifReader::~ExifReader() */ bool ExifReader::parse() { - CV_TRY { + try { m_exif = getExif(); if( !m_exif.empty() ) { return true; } return false; - } CV_CATCH (ExifParsingError, e) { - CV_UNUSED(e); + } catch (ExifParsingError&) { return false; } } @@ -152,11 +151,11 @@ std::map ExifReader::getExif() case COM: bytesToSkip = getFieldSize(); if (bytesToSkip < markerSize) { - CV_THROW (ExifParsingError()); + throw 
ExifParsingError(); } m_stream.seekg( static_cast( bytesToSkip - markerSize ), m_stream.cur ); if ( m_stream.fail() ) { - CV_THROW (ExifParsingError()); + throw ExifParsingError(); } break; @@ -167,12 +166,12 @@ std::map ExifReader::getExif() case APP1: //actual Exif Marker exifSize = getFieldSize(); if (exifSize <= offsetToTiffHeader) { - CV_THROW (ExifParsingError()); + throw ExifParsingError(); } m_data.resize( exifSize - offsetToTiffHeader ); m_stream.seekg( static_cast( offsetToTiffHeader ), m_stream.cur ); if ( m_stream.fail() ) { - CV_THROW (ExifParsingError()); + throw ExifParsingError(); } m_stream.read( reinterpret_cast(&m_data[0]), exifSize - offsetToTiffHeader ); exifFound = true; @@ -416,7 +415,7 @@ std::string ExifReader::getString(const size_t offset) const dataOffset = getU32( offset + 8 ); } if (dataOffset > m_data.size() || dataOffset + size > m_data.size()) { - CV_THROW (ExifParsingError()); + throw ExifParsingError(); } std::vector::const_iterator it = m_data.begin() + dataOffset; std::string result( it, it + size ); //copy vector content into result @@ -433,7 +432,7 @@ std::string ExifReader::getString(const size_t offset) const uint16_t ExifReader::getU16(const size_t offset) const { if (offset + 1 >= m_data.size()) - CV_THROW (ExifParsingError()); + throw ExifParsingError(); if( m_format == INTEL ) { @@ -451,7 +450,7 @@ uint16_t ExifReader::getU16(const size_t offset) const uint32_t ExifReader::getU32(const size_t offset) const { if (offset + 3 >= m_data.size()) - CV_THROW (ExifParsingError()); + throw ExifParsingError(); if( m_format == INTEL ) { diff --git a/modules/imgcodecs/src/grfmt_bmp.cpp b/modules/imgcodecs/src/grfmt_bmp.cpp index f20f186660..3d083b3211 100644 --- a/modules/imgcodecs/src/grfmt_bmp.cpp +++ b/modules/imgcodecs/src/grfmt_bmp.cpp @@ -89,7 +89,7 @@ bool BmpDecoder::readHeader() else if( !m_strm.open( m_filename )) return false; - CV_TRY + try { m_strm.skip( 10 ); m_offset = m_strm.getDWord(); @@ -173,9 +173,9 @@ bool BmpDecoder::readHeader() } } } - CV_CATCH_ALL + catch(...) { - CV_RETHROW(); + throw; } // in 32 bit case alpha channel is used - so require CV_8UC4 type m_type = iscolor ? (m_bpp == 32 ? CV_8UC4 : CV_8UC3 ) : CV_8UC1; @@ -225,7 +225,7 @@ bool BmpDecoder::readData( Mat& img ) } uchar *src = _src.data(), *bgr = _bgr.data(); - CV_TRY + try { m_strm.setPos( m_offset ); @@ -490,9 +490,9 @@ decode_rle8_bad: ; CV_Error(cv::Error::StsError, "Invalid/unsupported mode"); } } - CV_CATCH_ALL + catch(...) 
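// Every CV_TRY / CV_CATCH / CV_CATCH_ALL / CV_THROW / CV_RETHROW change in this
// patch is the same mechanical rewrite. The macros existed so the library could
// be built with C++ exceptions disabled; with that build mode dropped they
// reduce to the plain keywords:
//
//     CV_TRY { ... }                      ->  try { ... }
//     CV_CATCH(cv::Exception, e) { ... }  ->  catch (const cv::Exception& e) { ... }
//     CV_CATCH_ALL { ... }                ->  catch (...) { ... }
//     CV_THROW(exc);                      ->  throw exc;
//     CV_RETHROW();                       ->  throw;
//
// Catching by const reference (also applied to pre-existing catch clauses in
// features2d and imgproc below) avoids slicing and a copy of the exception.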
{ - CV_RETHROW(); + throw; } return result; diff --git a/modules/imgcodecs/src/grfmt_pam.cpp b/modules/imgcodecs/src/grfmt_pam.cpp index a0dc171f23..a97e25532e 100644 --- a/modules/imgcodecs/src/grfmt_pam.cpp +++ b/modules/imgcodecs/src/grfmt_pam.cpp @@ -379,25 +379,25 @@ bool PAMDecoder::readHeader() } else if( !m_strm.open( m_filename )) return false; - CV_TRY + try { byte = m_strm.getByte(); if( byte != 'P' ) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; byte = m_strm.getByte(); if (byte != '7') - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; byte = m_strm.getByte(); if (byte != '\n' && byte != '\r') - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; uint i; memset (&flds, 0x00, sizeof (struct parsed_fields)); do { if (!ReadPAMHeaderLine(m_strm, fieldtype, value)) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; switch (fieldtype) { case PAM_HEADER_NONE: case PAM_HEADER_COMMENT: @@ -407,32 +407,32 @@ bool PAMDecoder::readHeader() break; case PAM_HEADER_HEIGHT: if (flds.height) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; if (!ParseNumber (value, &m_height)) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; flds.height = true; break; case PAM_HEADER_WIDTH: if (flds.width) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; if (!ParseNumber (value, &m_width)) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; flds.width = true; break; case PAM_HEADER_DEPTH: if (flds.depth) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; if (!ParseNumber (value, &m_channels)) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; flds.depth = true; break; case PAM_HEADER_MAXVAL: if (flds.maxval) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; if (!ParseNumber (value, &m_maxval)) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; if ( m_maxval > 65535 ) - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; if ( m_maxval > 255 ) { m_sampledepth = CV_16U; } @@ -451,7 +451,7 @@ bool PAMDecoder::readHeader() } break; default: - CV_THROW( RBS_BAD_HEADER ); + throw RBS_BAD_HEADER; } } while (fieldtype != PAM_HEADER_ENDHDR); @@ -469,7 +469,7 @@ bool PAMDecoder::readHeader() return true; } - } CV_CATCH_ALL + } catch(...) { } @@ -512,7 +512,7 @@ bool PAMDecoder::readData( Mat& img ) } } - CV_TRY + try { m_strm.setPos( m_offset ); @@ -610,7 +610,7 @@ bool PAMDecoder::readData( Mat& img ) } res = true; - } CV_CATCH_ALL + } catch(...) { } diff --git a/modules/imgcodecs/src/grfmt_pxm.cpp b/modules/imgcodecs/src/grfmt_pxm.cpp index 1289ee45cb..055198ced3 100644 --- a/modules/imgcodecs/src/grfmt_pxm.cpp +++ b/modules/imgcodecs/src/grfmt_pxm.cpp @@ -150,11 +150,11 @@ bool PxMDecoder::readHeader() else if( !m_strm.open( m_filename )) return false; - CV_TRY + try { int code = m_strm.getByte(); if( code != 'P' ) - CV_THROW (RBS_BAD_HEADER); + throw RBS_BAD_HEADER; code = m_strm.getByte(); switch( code ) @@ -162,7 +162,7 @@ bool PxMDecoder::readHeader() case '1': case '4': m_bpp = 1; break; case '2': case '5': m_bpp = 8; break; case '3': case '6': m_bpp = 24; break; - default: CV_THROW (RBS_BAD_HEADER); + default: throw RBS_BAD_HEADER; } m_binary = code >= '4'; @@ -173,7 +173,7 @@ bool PxMDecoder::readHeader() m_maxval = m_bpp == 1 ? 
1 : ReadNumber(m_strm); if( m_maxval > 65535 ) - CV_THROW (RBS_BAD_HEADER); + throw RBS_BAD_HEADER; //if( m_maxval > 255 ) m_binary = false; nonsense if( m_maxval > 255 ) @@ -185,15 +185,14 @@ bool PxMDecoder::readHeader() result = true; } } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception&) { - CV_UNUSED(e); - CV_RETHROW(); + throw; } - CV_CATCH_ALL + catch (...) { std::cerr << "PXM::readHeader(): unknown C++ exception" << std::endl << std::flush; - CV_RETHROW(); + throw; } if( !result ) @@ -233,7 +232,7 @@ bool PxMDecoder::readData( Mat& img ) FillGrayPalette( palette, m_bpp==1 ? 1 : 8 , m_bpp == 1 ); } - CV_TRY + try { m_strm.setPos( m_offset ); @@ -359,15 +358,14 @@ bool PxMDecoder::readData( Mat& img ) CV_Error(Error::StsError, "m_bpp is not supported"); } } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception&) { - CV_UNUSED(e); - CV_RETHROW(); + throw; } - CV_CATCH_ALL + catch (...) { std::cerr << "PXM::readData(): unknown exception" << std::endl << std::flush; - CV_RETHROW(); + throw; } return result; diff --git a/modules/imgcodecs/src/grfmt_sunras.cpp b/modules/imgcodecs/src/grfmt_sunras.cpp index 2d342be82d..a59cc47255 100644 --- a/modules/imgcodecs/src/grfmt_sunras.cpp +++ b/modules/imgcodecs/src/grfmt_sunras.cpp @@ -84,7 +84,7 @@ bool SunRasterDecoder::readHeader() if( !m_strm.open( m_filename )) return false; - CV_TRY + try { m_strm.skip( 4 ); m_width = m_strm.getDWord(); @@ -144,7 +144,7 @@ bool SunRasterDecoder::readHeader() } } } - CV_CATCH_ALL + catch(...) { } @@ -179,7 +179,7 @@ bool SunRasterDecoder::readData( Mat& img ) if( !color && m_maptype == RMT_EQUAL_RGB ) CvtPaletteToGray( m_palette, gray_palette, 1 << m_bpp ); - CV_TRY + try { m_strm.setPos( m_offset ); @@ -376,7 +376,7 @@ bad_decoding_end: CV_Error(Error::StsInternal, ""); } } - CV_CATCH_ALL + catch( ... ) { } diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp index 24ff0489f0..655db016b1 100644 --- a/modules/imgcodecs/src/loadsave.cpp +++ b/modules/imgcodecs/src/loadsave.cpp @@ -433,18 +433,18 @@ imread_( const String& filename, int flags, Mat& mat ) /// set the filename in the driver decoder->setSource( filename ); - CV_TRY + try { // read the header to make sure it succeeds if( !decoder->readHeader() ) return 0; } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception& e) { std::cerr << "imread_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush; return 0; } - CV_CATCH_ALL + catch (...) { std::cerr << "imread_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush; return 0; @@ -472,16 +472,16 @@ imread_( const String& filename, int flags, Mat& mat ) // read the image data bool success = false; - CV_TRY + try { if (decoder->readData(mat)) success = true; } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception& e) { std::cerr << "imread_('" << filename << "'): can't read data: " << e.what() << std::endl << std::flush; } - CV_CATCH_ALL + catch (...) 
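// User-visible effect of the guards in loadsave.cpp above: decoder exceptions
// raised from readHeader()/readData() are logged to stderr and swallowed, and
// imread() signals failure through an empty Mat, so callers should check
// (input.png is a placeholder path):
#include <opencv2/imgcodecs.hpp>
#include <iostream>
int main()
{
    cv::Mat img = cv::imread("input.png", cv::IMREAD_COLOR);
    if (img.empty())
    {
        std::cerr << "input.png is missing or could not be decoded" << std::endl;
        return 1;
    }
    return 0;
}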
{ std::cerr << "imread_('" << filename << "'): can't read data: unknown exception" << std::endl << std::flush; } @@ -534,18 +534,18 @@ imreadmulti_(const String& filename, int flags, std::vector& mats) decoder->setSource(filename); // read the header to make sure it succeeds - CV_TRY + try { // read the header to make sure it succeeds if( !decoder->readHeader() ) return 0; } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception& e) { std::cerr << "imreadmulti_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush; return 0; } - CV_CATCH_ALL + catch (...) { std::cerr << "imreadmulti_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush; return 0; @@ -573,16 +573,16 @@ imreadmulti_(const String& filename, int flags, std::vector& mats) // read the image data Mat mat(size.height, size.width, type); bool success = false; - CV_TRY + try { if (decoder->readData(mat)) success = true; } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception& e) { std::cerr << "imreadmulti_('" << filename << "'): can't read data: " << e.what() << std::endl << std::flush; } - CV_CATCH_ALL + catch (...) { std::cerr << "imreadmulti_('" << filename << "'): can't read data: unknown exception" << std::endl << std::flush; } @@ -749,16 +749,16 @@ imdecode_( const Mat& buf, int flags, Mat& mat ) } bool success = false; - CV_TRY + try { if (decoder->readHeader()) success = true; } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception& e) { std::cerr << "imdecode_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush; } - CV_CATCH_ALL + catch (...) { std::cerr << "imdecode_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush; } @@ -794,16 +794,16 @@ imdecode_( const Mat& buf, int flags, Mat& mat ) mat.create( size.height, size.width, type ); success = false; - CV_TRY + try { if (decoder->readData(mat)) success = true; } - CV_CATCH (cv::Exception, e) + catch (const cv::Exception& e) { std::cerr << "imdecode_('" << filename << "'): can't read data: " << e.what() << std::endl << std::flush; } - CV_CATCH_ALL + catch (...) { std::cerr << "imdecode_('" << filename << "'): can't read data: unknown exception" << std::endl << std::flush; } diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index 39e942e8cd..fbd5223c13 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -291,11 +291,11 @@ static bool openvx_accumulate(InputArray _src, InputOutputArray _dst, InputArray srcImage.swapHandle(); dstImage.swapHandle(); #endif } - catch (ivx::RuntimeError & e) + catch (const ivx::RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (ivx::WrapperError & e) + catch (const ivx::WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/imgproc/src/box_filter.cpp b/modules/imgproc/src/box_filter.cpp new file mode 100644 index 0000000000..e4405b4b69 --- /dev/null +++ b/modules/imgproc/src/box_filter.cpp @@ -0,0 +1,1806 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved. 
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include + +#include "opencv2/core/hal/intrin.hpp" +#include "opencl_kernels_imgproc.hpp" + +#include "opencv2/core/openvx/ovx_defs.hpp" + +namespace cv +{ + +/****************************************************************************************\ + Box Filter +\****************************************************************************************/ + +template +struct RowSum : + public BaseRowFilter +{ + RowSum( int _ksize, int _anchor ) : + BaseRowFilter() + { + ksize = _ksize; + anchor = _anchor; + } + + virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE + { + const T* S = (const T*)src; + ST* D = (ST*)dst; + int i = 0, k, ksz_cn = ksize*cn; + + width = (width - 1)*cn; + if( ksize == 3 ) + { + for( i = 0; i < width + cn; i++ ) + { + D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2]; + } + } + else if( ksize == 5 ) + { + for( i = 0; i < width + cn; i++ ) + { + D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4]; + } + } + else if( cn == 1 ) + { + ST s = 0; + for( i = 0; i < ksz_cn; i++ ) + s += (ST)S[i]; + D[0] = s; + for( i = 0; i < width; i++ ) + { + s += (ST)S[i + ksz_cn] - (ST)S[i]; + D[i+1] = s; + } + } + else if( cn == 3 ) + { + ST s0 = 0, s1 = 0, s2 = 0; + for( i = 0; i < ksz_cn; i += 3 ) + { + s0 += (ST)S[i]; + s1 += (ST)S[i+1]; + s2 += (ST)S[i+2]; + } + D[0] = s0; + D[1] = s1; + D[2] = s2; + for( i = 0; i < width; i += 3 ) + { + s0 += (ST)S[i + ksz_cn] - (ST)S[i]; + s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; + s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; + D[i+3] = s0; + D[i+4] = s1; + D[i+5] = s2; + } + } + else if( cn == 4 ) + { + ST s0 = 0, s1 = 0, s2 = 0, s3 = 0; + for( i = 0; i < ksz_cn; i += 4 ) + { + s0 += (ST)S[i]; + s1 += (ST)S[i+1]; + s2 += (ST)S[i+2]; + s3 += (ST)S[i+3]; + } + D[0] = s0; + D[1] = s1; + D[2] = s2; + D[3] 
= s3; + for( i = 0; i < width; i += 4 ) + { + s0 += (ST)S[i + ksz_cn] - (ST)S[i]; + s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; + s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; + s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3]; + D[i+4] = s0; + D[i+5] = s1; + D[i+6] = s2; + D[i+7] = s3; + } + } + else + for( k = 0; k < cn; k++, S++, D++ ) + { + ST s = 0; + for( i = 0; i < ksz_cn; i += cn ) + s += (ST)S[i]; + D[0] = s; + for( i = 0; i < width; i += cn ) + { + s += (ST)S[i + ksz_cn] - (ST)S[i]; + D[i+cn] = s; + } + } + } +}; + + +template +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int i; + ST* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(ST)); + + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const ST* Sp = (const ST*)src[0]; + + for( i = 0; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const ST* Sp = (const ST*)src[0]; + const ST* Sm = (const ST*)src[1-ksize]; + T* D = (T*)dst; + if( haveScale ) + { + for( i = 0; i <= width - 2; i += 2 ) + { + ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; + D[i] = saturate_cast(s0*_scale); + D[i+1] = saturate_cast(s1*_scale); + s0 -= Sm[i]; s1 -= Sm[i+1]; + SUM[i] = s0; SUM[i+1] = s1; + } + + for( ; i < width; i++ ) + { + ST s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + for( i = 0; i <= width - 2; i += 2 ) + { + ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; + D[i] = saturate_cast(s0); + D[i+1] = saturate_cast(s1); + s0 -= Sm[i]; s1 -= Sm[i+1]; + SUM[i] = s0; SUM[i+1] = s1; + } + + for( ; i < width; i++ ) + { + ST s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const 
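// RowSum and ColumnSum above implement the classic separable box filter:
// RowSum turns each scanline into horizontal window sums (with unrolled
// ksize==3/5 and cn==1/3/4 cases), and ColumnSum keeps one running sum per
// column, adding the incoming row Sp and subtracting the row Sm that leaves
// the window. That makes the cost O(1) per pixel regardless of kernel size.
// The 1-D core of the idea, as a self-contained sketch:
#include <vector>
static std::vector<int> movingSum(const std::vector<int>& src, int ksize)
{
    int n = (int)src.size() - ksize + 1;
    std::vector<int> dst(n > 0 ? n : 0);
    int s = 0;
    for (int i = 0; i < ksize && i < (int)src.size(); i++)
        s += src[i];                       // prime the first window
    for (int i = 0; i < (int)dst.size(); i++)
    {
        dst[i] = s;                        // sum of src[i .. i+ksize-1]
        if (i + ksize < (int)src.size())
            s += src[i + ksize] - src[i];  // slide: add entering, drop leaving
    }
    return dst;
}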
int*)src[1-ksize]; + uchar* D = (uchar*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); + v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); + + v_uint16 v_dst = v_pack(v_s0d, v_s01d); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); + v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); + + v_uint16x8 v_dst = v_pack(v_s0d, v_s01d); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; +#if CV_SIMD + for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_uint16 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); + v_pack_store(D + i, v_dst); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : +public BaseColumnFilter +{ + enum { SHIFT = 23 }; + + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + divDelta = 0; + divScale = 1; + if( scale != 1 ) + { + int d = cvRound(1./scale); + double scalef = ((double)(1 << SHIFT))/d; + divScale = cvFloor(scalef); + scalef -= divScale; + divDelta = d/2; + if( scalef < 0.5 ) + divDelta++; + else + divScale++; + } + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + const int ds = divScale; + const int dd = divDelta; + ushort* SUM; + const bool haveScale = scale != 1; + + if( width != 
(int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(SUM[0])); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const ushort* Sp = (const ushort*)src[0]; + int i = 0; +#if CV_SIMD + for( ; i <= width - v_uint16::nlanes; i += v_uint16::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_uint16x8::nlanes; i += v_uint16x8::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const ushort* Sp = (const ushort*)src[0]; + const ushort* Sm = (const ushort*)src[1-ksize]; + uchar* D = (uchar*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_uint32 _ds4 = vx_setall_u32((unsigned)ds); + v_uint16 _dd8 = vx_setall_u16((ushort)dd); + + for( ; i <= width-v_uint8::nlanes; i+=v_uint8::nlanes ) + { + v_uint16 _sm0 = vx_load(Sm + i); + v_uint16 _sm1 = vx_load(Sm + i + v_uint16::nlanes); + + v_uint16 _s0 = v_add_wrap(vx_load(SUM + i), vx_load(Sp + i)); + v_uint16 _s1 = v_add_wrap(vx_load(SUM + i + v_uint16::nlanes), vx_load(Sp + i + v_uint16::nlanes)); + + v_uint32 _s00, _s01, _s10, _s11; + + v_expand(_s0 + _dd8, _s00, _s01); + v_expand(_s1 + _dd8, _s10, _s11); + + _s00 = v_shr(_s00*_ds4); + _s01 = v_shr(_s01*_ds4); + _s10 = v_shr(_s10*_ds4); + _s11 = v_shr(_s11*_ds4); + + v_int16 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); + v_int16 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); + + _s0 = v_sub_wrap(_s0, _sm0); + _s1 = v_sub_wrap(_s1, _sm1); + + v_store(D + i, v_pack_u(r0, r1)); + v_store(SUM + i, _s0); + v_store(SUM + i + v_uint16::nlanes, _s1); + } +#if CV_SIMD_WIDTH > 16 + v_uint32x4 ds4 = v_setall_u32((unsigned)ds); + v_uint16x8 dd8 = v_setall_u16((ushort)dd); + + for( ; i <= width-v_uint8x16::nlanes; i+=v_uint8x16::nlanes ) + { + v_uint16x8 _sm0 = v_load(Sm + i); + v_uint16x8 _sm1 = v_load(Sm + i + v_uint16x8::nlanes); + + v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); + v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + v_uint16x8::nlanes), v_load(Sp + i + v_uint16x8::nlanes)); + + v_uint32x4 _s00, _s01, _s10, _s11; + + v_expand(_s0 + dd8, _s00, _s01); + v_expand(_s1 + dd8, _s10, _s11); + + _s00 = v_shr(_s00*ds4); + _s01 = v_shr(_s01*ds4); + _s10 = v_shr(_s10*ds4); + _s11 = v_shr(_s11*ds4); + + v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); + v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); + + _s0 = v_sub_wrap(_s0, _sm0); + _s1 = v_sub_wrap(_s1, _sm1); + + v_store(D + i, v_pack_u(r0, r1)); + v_store(SUM + i, _s0); + v_store(SUM + i + v_uint16x8::nlanes, _s1); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (uchar)((s0 + dd)*ds >> SHIFT); + SUM[i] = (ushort)(s0 - Sm[i]); + } + } + else + { + int i = 0; + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = (ushort)(s0 - Sm[i]); + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + int divDelta; + int divScale; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + 
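// The ushort -> uchar specialization above replaces the per-pixel division by
// the window area d with fixed-point arithmetic: it precomputes ds ~ 2^SHIFT/d
// plus a rounding offset dd so that (s + dd)*ds >> SHIFT equals the rounded
// quotient; SHIFT == 23 keeps (255*d + dd)*ds inside 32 bits for the sums a
// normalized 8-bit box filter can produce. A quick self-check of the identity
// (it should print nothing):
#include <cmath>
#include <cstdio>
int main()
{
    const int SHIFT = 23, d = 9;                 // e.g. a normalized 3x3 box
    double scalef = double(1 << SHIFT) / d;
    int ds = (int)std::floor(scalef);
    int dd = d / 2;
    if (scalef - ds < 0.5) dd++; else ds++;      // same rounding rule as above
    for (int s = 0; s <= 255 * d; s++)
        if (((s + dd) * ds >> SHIFT) != (s + d / 2) / d)
            std::printf("mismatch at s=%d\n", s);
    return 0;
}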
} + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int i; + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + i = 0; +#if CV_SIMD + for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + short* D = (short*)dst; + if( haveScale ) + { + i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); + v_int32 v_s01d = v_round(v_cvt_f32(v_s01) * _v_scale); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); + v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + i = 0; +#if CV_SIMD + for( ; i <= width-v_int16::nlanes; i+=v_int16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_store(D + i, v_pack(v_s0, v_s01)); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_int16x8::nlanes; i+=v_int16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_store(D + i, v_pack(v_s0, v_s01)); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, 
int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + ushort* D = (ushort*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_uint32 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * _v_scale)); + v_uint32 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * _v_scale)); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); + v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); + v_store(D + i, v_pack(v_s0d, v_s01d)); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; +#if CV_SIMD + for( ; i <= width-v_uint16::nlanes; i+=v_uint16::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s01 = vx_load(SUM + i + v_int32::nlanes) + vx_load(Sp + i + v_int32::nlanes); + + v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); + + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + v_store(SUM + i + v_int32::nlanes, v_s01 - vx_load(Sm + i + v_int32::nlanes)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_uint16x8::nlanes; i+=v_uint16x8::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s01 = v_load(SUM + i + v_int32x4::nlanes) + v_load(Sp + i + v_int32x4::nlanes); + + v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); + + v_store(SUM + i, v_s0 - v_load(Sm + i)); + v_store(SUM + i + v_int32x4::nlanes, v_s01 - v_load(Sm + i + v_int32x4::nlanes)); + } +#endif +#endif + 
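// Every SIMD block in these ColumnSum specializations follows one template: a
// main loop over the widest available "vx_" vectors, an optional second loop
// over fixed 128-bit vectors when CV_SIMD_WIDTH > 16, and a scalar tail, with
// vx_cleanup() at the end to restore SIMD state (a vzeroupper-style cleanup on
// AVX targets). Reduced to its essentials for the scaled int -> short case:
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
static void scaleRow(const int* src, short* dst, int width, float scale)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    v_float32 vscale = vx_setall_f32(scale);
    for (; i <= width - v_int16::nlanes; i += v_int16::nlanes)
    {
        v_int32 lo = vx_load(src + i);                       // low half lanes
        v_int32 hi = vx_load(src + i + v_int32::nlanes);     // high half lanes
        v_store(dst + i, v_pack(v_round(v_cvt_f32(lo) * vscale),
                                v_round(v_cvt_f32(hi) * vscale)));
    }
    vx_cleanup();
#endif
    for (; i < width; i++)                                   // scalar tail
        dst[i] = saturate_cast<short>(src[i] * scale);
}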
for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int* Sp = (const int*)src[0]; + const int* Sm = (const int*)src[1-ksize]; + int* D = (int*)dst; + if( haveScale ) + { + int i = 0; +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_int32 v_s0d = v_round(v_cvt_f32(v_s0) * _v_scale); + + v_store(D + i, v_s0d); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); + + v_store(D + i, v_s0d); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = saturate_cast(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; +#if CV_SIMD + for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + + v_store(D + i, v_s0); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + + v_store(D + i, v_s0); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = s0; + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + + +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() CV_OVERRIDE { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + 
SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void*)SUM, 0, width*sizeof(int)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + int i = 0; +#if CV_SIMD + for( ; i <= width - v_int32::nlanes; i+=v_int32::nlanes ) + { + v_store(SUM + i, vx_load(SUM + i) + vx_load(Sp + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width - v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); + } +#endif +#endif + + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int * Sp = (const int*)src[0]; + const int * Sm = (const int*)src[1-ksize]; + float* D = (float*)dst; + if( haveScale ) + { + int i = 0; + +#if CV_SIMD + v_float32 _v_scale = vx_setall_f32((float)_scale); + for (; i <= width - v_int32::nlanes; i += v_int32::nlanes) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0) * _v_scale); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + v_float32x4 v_scale = v_setall_f32((float)_scale); + for (; i <= width - v_int32x4::nlanes; i += v_int32x4::nlanes) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0) * v_scale); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + int i = 0; + +#if CV_SIMD + for( ; i <= width-v_int32::nlanes; i+=v_int32::nlanes ) + { + v_int32 v_s0 = vx_load(SUM + i) + vx_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0)); + v_store(SUM + i, v_s0 - vx_load(Sm + i)); + } +#if CV_SIMD_WIDTH > 16 + for( ; i <= width-v_int32x4::nlanes; i+=v_int32x4::nlanes ) + { + v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); + v_store(D + i, v_cvt_f32(v_s0)); + v_store(SUM + i, v_s0 - v_load(Sm + i)); + } +#endif +#endif + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } +#if CV_SIMD + vx_cleanup(); +#endif + } + + double scale; + int sumCount; + std::vector sum; +}; + +#ifdef HAVE_OPENCL + +static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, int borderType, bool normalize ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + if (ddepth < 0) + ddepth = sdepth; + + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + if ( !(dev.isIntel() && (type == CV_8UC1) && + (_src.offset() == 0) && (_src.step() % 4 == 0) && + (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && + (anchor.x == 1) && (anchor.y == 1) && + (ksize.width == 3) && (ksize.height == 3)) ) + return false; + + float alpha = 1.0f / (ksize.height * ksize.width); + Size size = _src.size(); + size_t globalsize[2] = { 0, 0 }; + size_t localsize[2] = { 0, 0 }; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + + globalsize[0] = size.width / 16; + globalsize[1] = size.height / 2; + + char build_opts[1024]; + sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? 
"-D NORMALIZE" : ""); + + ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); + if (kernel.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) + return false; + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); + idxArg = kernel.set(idxArg, (int)dst.step); + idxArg = kernel.set(idxArg, (int)dst.rows); + idxArg = kernel.set(idxArg, (int)dst.cols); + if (normalize) + idxArg = kernel.set(idxArg, (float)alpha); + + return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); +} + +static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); + bool doubleSupport = dev.doubleFPConfig() > 0; + + if (ddepth < 0) + ddepth = sdepth; + + if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || + _src.offset() % esz != 0 || _src.step() % esz != 0) + return false; + + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + int computeUnits = ocl::Device::getDefault().maxComputeUnits(); + float alpha = 1.0f / (ksize.height * ksize.width); + Size size = _src.size(), wholeSize; + bool isolated = (borderType & BORDER_ISOLATED) != 0; + borderType &= ~BORDER_ISOLATED; + int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), + wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); + + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; + size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; + + UMat src = _src.getUMat(); + if (!isolated) + { + Point ofs; + src.locateROI(wholeSize, ofs); + } + + int h = isolated ? size.height : wholeSize.height; + int w = isolated ? size.width : wholeSize.width; + + size_t maxWorkItemSizes[32]; + ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); + int tryWorkItems = (int)maxWorkItemSizes[0]; + + ocl::Kernel kernel; + + if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && + ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || + (ksize.width == 5 && ksize.height == 5 && cn == 1))) + { + if (w < ksize.width || h < ksize.height) + return false; + + // Figure out what vector size to use for loading the pixels. + int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; + int pxLoadVecSize = cn * pxLoadNumPixels; + + // Figure out how many pixels per work item to compute in X and Y + // directions. Too many and we run out of registers. + int pxPerWorkItemX = 1, pxPerWorkItemY = 1; + if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) + { + pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; + pxPerWorkItemY = size.height % 2 ? 1 : 2; + } + else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) + { + pxPerWorkItemX = size.width % 2 ? 1 : 2; + pxPerWorkItemY = size.height % 2 ? 
1 : 2; + } + globalsize[0] = size.width / pxPerWorkItemX; + globalsize[1] = size.height / pxPerWorkItemY; + + // Need some padding in the private array for pixels + int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); + + // Make the global size a nice round number so the runtime can pick + // from reasonable choices for the workgroup size + const int wgRound = 256; + globalsize[0] = roundUp(globalsize[0], wgRound); + + char build_options[1024], cvt[2][40]; + sprintf(build_options, "-D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " + "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " + "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", + cn, anchor.x, anchor.y, ksize.width, ksize.height, + pxLoadVecSize, pxLoadNumPixels, + pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), + normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", + ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV + ); + + + if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) + return false; + } + else + { + localsize = localsize_general; + for ( ; ; ) + { + int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); + + while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) + BLOCK_SIZE_X /= 2; + while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) + BLOCK_SIZE_Y *= 2; + + if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) + return false; + + char cvt[2][50]; + String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" + " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" + " -D ST1=%s -D DT1=%s -D cn=%d", + BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), + anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], + isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", + normalize ? " -D NORMALIZE" : "", sqr ? 
" -D SQR" : "", + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); + + localsize[0] = BLOCK_SIZE_X; + globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; + globalsize[1] = divUp(size.height, BLOCK_SIZE_Y); + + kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); + if (kernel.empty()) + return false; + + size_t kernelWorkGroupSize = kernel.workGroupSize(); + if (localsize[0] <= kernelWorkGroupSize) + break; + if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) + return false; + + tryWorkItems = (int)kernelWorkGroupSize; + } + } + + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); + int srcOffsetY = (int)(src.offset / src.step); + int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; + int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; + idxArg = kernel.set(idxArg, srcOffsetX); + idxArg = kernel.set(idxArg, srcOffsetY); + idxArg = kernel.set(idxArg, srcEndX); + idxArg = kernel.set(idxArg, srcEndY); + idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); + if (normalize) + idxArg = kernel.set(idxArg, (float)alpha); + + return kernel.run(2, globalsize, localsize, false); +} + +#endif + +} + + +cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); + CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + + if( anchor < 0 ) + anchor = ksize/2; + + if( sdepth == CV_8U && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_8U && ddepth == CV_16U ) + return makePtr >(ksize, anchor); + if( sdepth == CV_8U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16U && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16S && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_32S && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16S && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_32F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_64F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + + CV_Error_( CV_StsNotImplemented, + ("Unsupported combination of source format (=%d), and buffer format (=%d)", + srcType, sumType)); +} + + +cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, + int anchor, double scale) +{ + int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); + CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); + + if( anchor < 0 ) + anchor = ksize/2; + + if( ddepth == CV_8U && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_8U && sdepth == CV_16U ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_8U && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16U && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16U && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16S && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_16S && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_32S && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + 
if( ddepth == CV_32F && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_32F && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_64F && sdepth == CV_32S ) + return makePtr >(ksize, anchor, scale); + if( ddepth == CV_64F && sdepth == CV_64F ) + return makePtr >(ksize, anchor, scale); + + CV_Error_( CV_StsNotImplemented, + ("Unsupported combination of sum format (=%d), and destination format (=%d)", + sumType, dstType)); +} + + +cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType ) +{ + int sdepth = CV_MAT_DEPTH(srcType); + int cn = CV_MAT_CN(srcType), sumType = CV_64F; + if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && + ksize.width*ksize.height <= 256 ) + sumType = CV_16U; + else if( sdepth <= CV_32S && (!normalize || + ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) : + sdepth == CV_16U ? (1 << 15) : (1 << 16))) ) + sumType = CV_32S; + sumType = CV_MAKETYPE( sumType, cn ); + + Ptr rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x ); + Ptr columnFilter = getColumnSumFilter(sumType, + dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1); + + return makePtr(Ptr(), rowFilter, columnFilter, + srcType, dstType, sumType, borderType ); +} + +#ifdef HAVE_OPENVX +namespace cv +{ + namespace ovx { + template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } + } + static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) + { + if (ddepth < 0) + ddepth = CV_8UC1; + if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || + _src.cols() < 3 || _src.rows() < 3 || + ksize.width != 3 || ksize.height != 3 || + (anchor.x >= 0 && anchor.x != 1) || + (anchor.y >= 0 && anchor.y != 1) || + ovx::skipSmallImages(_src.cols(), _src.rows())) + return false; + + Mat src = _src.getMat(); + + if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) + return false; //Process isolated borders only + vx_enum border; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + border = VX_BORDER_CONSTANT; + break; + case BORDER_REPLICATE: + border = VX_BORDER_REPLICATE; + break; + default: + return false; + } + + _dst.create(src.size(), CV_8UC1); + Mat dst = _dst.getMat(); + + try + { + ivx::Context ctx = ovx::getOpenVXContext(); + + Mat a; + if (dst.data != src.data) + a = src; + else + src.copyTo(a); + + ivx::Image + ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), + ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); + + //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments + //since OpenVX standard says nothing about thread-safety for now + ivx::border_t prevBorder = ctx.immediateBorder(); + ctx.setImmediateBorder(border, (vx_uint8)(0)); + ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); + ctx.setImmediateBorder(prevBorder); + } + catch (const ivx::RuntimeError & e) + { + VX_DbgThrow(e.what()); + } + catch (const ivx::WrapperError & e) + { + VX_DbgThrow(e.what()); + } + + return true; + } +} +#endif + +#if defined(HAVE_IPP) && OPENCV_IPP_REDUCE_SIZE == 0 +namespace cv +{ +static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) +{ 
+#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + +#if IPP_VERSION_X100 < 201801 + // Problem with SSE42 optimization for 16s and some 8u modes + if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) + return false; + + // Other optimizations has some degradations too + if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) + return false; +#endif + + if(!normalize) + return false; + + if(!ippiCheckAnchor(anchor, ksize)) + return false; + + try + { + ::ipp::IwiImage iwSrc = ippiGetImage(src); + ::ipp::IwiImage iwDst = ippiGetImage(dst); + ::ipp::IwiSize iwKSize = ippiGetSize(ksize); + ::ipp::IwiBorderSize borderSize(iwKSize); + ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); + if(!ippBorder) + return false; + + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); + } + catch (const ::ipp::IwException &) + { + return false; + } + + return true; +#else + CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); + return false; +#endif +} +} +#endif + + +void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType ) +{ + CV_INSTRUMENT_REGION(); + + CV_OCL_RUN(_dst.isUMat() && + (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || + borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), + ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) + + CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) + + Mat src = _src.getMat(); + int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); + if( ddepth < 0 ) + ddepth = sdepth; + _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); + Mat dst = _dst.getMat(); + if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) + { + if( src.rows == 1 ) + ksize.height = 1; + if( src.cols == 1 ) + ksize.width = 1; + } + + Point ofs; + Size wsz(src.cols, src.rows); + if(!(borderType&BORDER_ISOLATED)) + src.locateROI( wsz, ofs ); + + CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, + ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, + anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); + + CV_OVX_RUN(true, + openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) + +#if OPENCV_IPP_REDUCE_SIZE == 0 + CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); +#endif + + borderType = (borderType&~BORDER_ISOLATED); + + Ptr f = createBoxFilter( src.type(), dst.type(), + ksize, anchor, normalize, borderType ); + + f->apply( src, dst, wsz, ofs ); +} + + +void cv::blur( InputArray src, OutputArray dst, + Size ksize, Point anchor, int borderType ) +{ + CV_INSTRUMENT_REGION(); + + boxFilter( src, dst, -1, ksize, anchor, true, borderType ); +} + + +/****************************************************************************************\ + Squared Box Filter +\****************************************************************************************/ + +namespace cv 
+{ + +template +struct SqrRowSum : + public BaseRowFilter +{ + SqrRowSum( int _ksize, int _anchor ) : + BaseRowFilter() + { + ksize = _ksize; + anchor = _anchor; + } + + virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE + { + const T* S = (const T*)src; + ST* D = (ST*)dst; + int i = 0, k, ksz_cn = ksize*cn; + + width = (width - 1)*cn; + for( k = 0; k < cn; k++, S++, D++ ) + { + ST s = 0; + for( i = 0; i < ksz_cn; i += cn ) + { + ST val = (ST)S[i]; + s += val*val; + } + D[0] = s; + for( i = 0; i < width; i += cn ) + { + ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn]; + s += val1*val1 - val0*val0; + D[i+cn] = s; + } + } + } +}; + +static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); + CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); + + if( anchor < 0 ) + anchor = ksize/2; + + if( sdepth == CV_8U && ddepth == CV_32S ) + return makePtr >(ksize, anchor); + if( sdepth == CV_8U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16U && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_16S && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_32F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + if( sdepth == CV_64F && ddepth == CV_64F ) + return makePtr >(ksize, anchor); + + CV_Error_( CV_StsNotImplemented, + ("Unsupported combination of source format (=%d), and buffer format (=%d)", + srcType, sumType)); +} + +} + +void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType ) +{ + CV_INSTRUMENT_REGION(); + + int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); + Size size = _src.size(); + + if( ddepth < 0 ) + ddepth = sdepth < CV_32F ? CV_32F : CV_64F; + + if( borderType != BORDER_CONSTANT && normalize ) + { + if( size.height == 1 ) + ksize.height = 1; + if( size.width == 1 ) + ksize.width = 1; + } + + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, + ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) + + int sumDepth = CV_64F; + if( sdepth == CV_8U ) + sumDepth = CV_32S; + int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); + + Mat src = _src.getMat(); + _dst.create( size, dstType ); + Mat dst = _dst.getMat(); + + Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); + Ptr columnFilter = getColumnSumFilter(sumType, + dstType, ksize.height, anchor.y, + normalize ? 1./(ksize.width*ksize.height) : 1); + + Ptr f = makePtr(Ptr(), rowFilter, columnFilter, + srcType, dstType, sumType, borderType ); + Point ofs; + Size wsz(src.cols, src.rows); + src.locateROI( wsz, ofs ); + + f->apply( src, dst, wsz, ofs ); +} + +/* End of file. */ diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 43445bd375..84e3c0a36a 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1821,7 +1821,7 @@ cvFindContours_Impl( void* img, CvMemStorage* storage, } else { - CV_TRY + try { scanner = cvStartFindContours_Impl( img, storage, cntHeaderSize, mode, method, offset, needFillBorder); @@ -1833,11 +1833,11 @@ cvFindContours_Impl( void* img, CvMemStorage* storage, } while( contour != 0 ); } - CV_CATCH_ALL + catch(...) 
{ if( scanner ) cvEndFindContours(&scanner); - CV_RETHROW(); + throw; } *firstContour = cvEndFindContours( &scanner ); diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp index 27a3ae7fa8..6bcc0bf451 100644 --- a/modules/imgproc/src/deriv.cpp +++ b/modules/imgproc/src/deriv.cpp @@ -246,11 +246,11 @@ namespace cv ivx::IVX_CHECK_STATUS(vxuSobel3x3(ctx, ia, NULL, ib)); ctx.setImmediateBorder(prevBorder); } - catch (ivx::RuntimeError & e) + catch (const ivx::RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (ivx::WrapperError & e) + catch (const ivx::WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/imgproc/src/featureselect.cpp b/modules/imgproc/src/featureselect.cpp index 40d5551667..26c20ea832 100644 --- a/modules/imgproc/src/featureselect.cpp +++ b/modules/imgproc/src/featureselect.cpp @@ -338,11 +338,11 @@ static bool openvx_harris(Mat image, OutputArray _corners, ovxImage.swapHandle(); #endif } - catch (RuntimeError & e) + catch (const RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (WrapperError & e) + catch (const WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 6e7d2e627d..1c80e847c5 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -793,11 +793,11 @@ namespace cv img.swapHandle(); #endif } - catch (ivx::RuntimeError & e) + catch (const ivx::RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (ivx::WrapperError & e) + catch (const ivx::WrapperError & e) { VX_DbgThrow(e.what()); } @@ -3313,11 +3313,11 @@ static bool openvx_equalize_hist(Mat srcMat, Mat dstMat) srcImage.swapHandle(); dstImage.swapHandle(); #endif } - catch (RuntimeError & e) + catch (const RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (WrapperError & e) + catch (const WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 6571753bd6..e7b1d50151 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1598,12 +1598,12 @@ static bool openvx_remap(Mat src, Mat dst, Mat map1, Mat map2, int interpolation ctx.setImmediateBorder(prevBorder); } - catch (ivx::RuntimeError & e) + catch (const ivx::RuntimeError & e) { CV_Error(CV_StsInternal, e.what()); return false; } - catch (ivx::WrapperError & e) + catch (const ivx::WrapperError & e) { CV_Error(CV_StsInternal, e.what()); return false; diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.cpp index 712cc731a7..24002d3b83 100644 --- a/modules/imgproc/src/median_blur.cpp +++ b/modules/imgproc/src/median_blur.cpp @@ -1068,11 +1068,11 @@ static bool openvx_medianFilter(InputArray _src, OutputArray _dst, int ksize) #endif ctx.setImmediateBorder(prevBorder); } - catch (ivx::RuntimeError & e) + catch (const ivx::RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (ivx::WrapperError & e) + catch (const ivx::WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 3d00a5168b..d212237a37 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -861,11 +861,11 @@ static bool openvx_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, srcImg.swapHandle(); dstImg.swapHandle(); #endif } - catch (RuntimeError & e) + catch (const RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (WrapperError & e) + catch (const WrapperError & e) { VX_DbgThrow(e.what()); } diff --git 
a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index d36f72f585..4cb1914241 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -54,1640 +54,6 @@ #include "fixedpoint.inl.hpp" -namespace cv -{ - -/****************************************************************************************\ - Box Filter -\****************************************************************************************/ - -template<typename T, typename ST> -struct RowSum : - public BaseRowFilter -{ - RowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - if( ksize == 3 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2]; - } - } - else if( ksize == 5 ) - { - for( i = 0; i < width + cn; i++ ) - { - D[i] = (ST)S[i] + (ST)S[i+cn] + (ST)S[i+cn*2] + (ST)S[i + cn*3] + (ST)S[i + cn*4]; - } - } - else if( cn == 1 ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i++ ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i++ ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+1] = s; - } - } - else if( cn == 3 ) - { - ST s0 = 0, s1 = 0, s2 = 0; - for( i = 0; i < ksz_cn; i += 3 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - for( i = 0; i < width; i += 3 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - D[i+3] = s0; - D[i+4] = s1; - D[i+5] = s2; - } - } - else if( cn == 4 ) - { - ST s0 = 0, s1 = 0, s2 = 0, s3 = 0; - for( i = 0; i < ksz_cn; i += 4 ) - { - s0 += (ST)S[i]; - s1 += (ST)S[i+1]; - s2 += (ST)S[i+2]; - s3 += (ST)S[i+3]; - } - D[0] = s0; - D[1] = s1; - D[2] = s2; - D[3] = s3; - for( i = 0; i < width; i += 4 ) - { - s0 += (ST)S[i + ksz_cn] - (ST)S[i]; - s1 += (ST)S[i + ksz_cn + 1] - (ST)S[i + 1]; - s2 += (ST)S[i + ksz_cn + 2] - (ST)S[i + 2]; - s3 += (ST)S[i + ksz_cn + 3] - (ST)S[i + 3]; - D[i+4] = s0; - D[i+5] = s1; - D[i+6] = s2; - D[i+7] = s3; - } - } - else - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i += cn ) - s += (ST)S[i]; - D[0] = s; - for( i = 0; i < width; i += cn ) - { - s += (ST)S[i + ksz_cn] - (ST)S[i]; - D[i+cn] = s; - } - } - } -}; - - -template<typename ST, typename T> -struct ColumnSum : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - ST* SUM; - bool haveScale = scale != 1; - double _scale = scale; - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(ST)); - - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ST* Sp = (const ST*)src[0]; - - for( i = 0; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ST* Sp = (const ST*)src[0]; - const ST* Sm = (const ST*)src[1-ksize]; - T* D = (T*)dst; - if( haveScale ) - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast<T>(s0*_scale); - D[i+1] = saturate_cast<T>(s1*_scale); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<T>(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - for( i = 0; i <= width - 2; i += 2 ) - { - ST s0 = SUM[i] + Sp[i], s1 = SUM[i+1] + Sp[i+1]; - D[i] = saturate_cast<T>(s0); - D[i+1] = saturate_cast<T>(s1); - s0 -= Sm[i]; s1 -= Sm[i+1]; - SUM[i] = s0; SUM[i+1] = s1; - } - - for( ; i < width; i++ ) - { - ST s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<T>(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector<ST> sum; -}; - - -template<> -struct ColumnSum<int, uchar> : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for (; i <= width - 4; i += 4) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - - v_uint16x8 v_dst = v_pack(v_s0d, v_s01d); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<uchar>(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_uint16x8 v_dst = v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01)); - v_pack_store(D + i, v_dst); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<uchar>(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector<int> sum; -}; - - -template<> -struct ColumnSum<ushort, uchar> : -public BaseColumnFilter -{ - enum { SHIFT = 23 }; - - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - divDelta = 0; - divScale = 1; - if( scale != 1 ) - { - int d = cvRound(1./scale); - double scalef = ((double)(1 << SHIFT))/d; - divScale = cvFloor(scalef); - scalef -= divScale; - divDelta = d/2; - if( scalef < 0.5 ) - divDelta++; - else - divScale++; - } - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - const int ds = divScale; - const int dd = divDelta; - ushort* SUM; - const bool haveScale = scale != 1; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(SUM[0])); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 8; i += 8 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const ushort* Sp = (const ushort*)src[0]; - const ushort* Sm = (const ushort*)src[1-ksize]; - uchar* D = (uchar*)dst; - if( haveScale ) - { - int i = 0; - #if CV_SIMD128 - v_uint32x4 ds4 = v_setall_u32((unsigned)ds); - v_uint16x8 dd8 = v_setall_u16((ushort)dd); - - for( ; i <= width-16; i+=16 ) - { - v_uint16x8 _sm0 = v_load(Sm + i); - v_uint16x8 _sm1 = v_load(Sm + i + 8); - - v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i)); - v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + 8), v_load(Sp + i + 8)); - - v_uint32x4 _s00, _s01, _s10, _s11; - - v_expand(_s0 + dd8, _s00, _s01); - v_expand(_s1 + dd8, _s10, _s11); - - _s00 = v_shr<SHIFT>(_s00*ds4); - _s01 = v_shr<SHIFT>(_s01*ds4); - _s10 = v_shr<SHIFT>(_s10*ds4); - _s11 = v_shr<SHIFT>(_s11*ds4); - - v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01)); - v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11)); - - _s0 = v_sub_wrap(_s0, _sm0); - _s1 = v_sub_wrap(_s1, _sm1); - - v_store(D + i, v_pack_u(r0, r1)); - v_store(SUM + i, _s0); - v_store(SUM + i + 8, _s1); - } - #endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (uchar)((s0 + dd)*ds >> SHIFT); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - else - { - int i = 0; - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<uchar>(s0); - SUM[i] = (ushort)(s0 - Sm[i]); - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - int divDelta; - int divScale; - std::vector<ushort> sum; -}; - - -template<> -struct ColumnSum<int, short> : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i; - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 4; i+=4 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - short* D = (short*)dst; - if( haveScale ) - { - i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - v_int32x4 v_s01d = v_round(v_cvt_f32(v_s01) * v_scale); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<short>(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_pack(v_s0, v_s01)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<short>(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector<int> sum; -}; - - -template<> -struct ColumnSum<int, ushort> : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for (; i <= width - 4; i += 4) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - ushort* D = (ushort*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_uint32x4 v_s0d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s0) * v_scale)); - v_uint32x4 v_s01d = v_reinterpret_as_u32(v_round(v_cvt_f32(v_s01) * v_scale)); - v_store(D + i, v_pack(v_s0d, v_s01d)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<ushort>(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_pack(v_reinterpret_as_u32(v_s0), v_reinterpret_as_u32(v_s01))); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<ushort>(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector<int> sum; -}; - -template<> -struct ColumnSum<int, int> : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 4; i+=4 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int* Sp = (const int*)src[0]; - const int* Sm = (const int*)src[1-ksize]; - int* D = (int*)dst; - if( haveScale ) - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for( ; i <= width-4; i+=4 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s0d = v_round(v_cvt_f32(v_s0) * v_scale); - - v_store(D + i, v_s0d); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = saturate_cast<int>(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-4; i+=4 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - - v_store(D + i, v_s0); - v_store(SUM + i, v_s0 - v_load(Sm + i)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = s0; - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector<int> sum; -}; - - -template<> -struct ColumnSum<int, float> : - public BaseColumnFilter -{ - ColumnSum( int _ksize, int _anchor, double _scale ) : - BaseColumnFilter() - { - ksize = _ksize; - anchor = _anchor; - scale = _scale; - sumCount = 0; - } - - virtual void reset() CV_OVERRIDE { sumCount = 0; } - - virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int* SUM; - bool haveScale = scale != 1; - double _scale = scale; - -#if CV_SIMD128 - bool haveSIMD128 = hasSIMD128(); -#endif - - if( width != (int)sum.size() ) - { - sum.resize(width); - sumCount = 0; - } - - SUM = &sum[0]; - if( sumCount == 0 ) - { - memset((void*)SUM, 0, width*sizeof(int)); - for( ; sumCount < ksize - 1; sumCount++, src++ ) - { - const int* Sp = (const int*)src[0]; - int i = 0; -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width - 4; i+=4 ) - { - v_store(SUM + i, v_load(SUM + i) + v_load(Sp + i)); - } - } -#endif - - for( ; i < width; i++ ) - SUM[i] += Sp[i]; - } - } - else - { - CV_Assert( sumCount == ksize-1 ); - src += ksize-1; - } - - for( ; count--; src++ ) - { - const int * Sp = (const int*)src[0]; - const int * Sm = (const int*)src[1-ksize]; - float* D = (float*)dst; - if( haveScale ) - { - int i = 0; - -#if CV_SIMD128 - if( haveSIMD128 ) - { - v_float32x4 v_scale = v_setall_f32((float)_scale); - for (; i <= width - 8; i += 8) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_cvt_f32(v_s0) * v_scale); - v_store(D + i + 4, v_cvt_f32(v_s01) * v_scale); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (float)(s0*_scale); - SUM[i] = s0 - Sm[i]; - } - } - else - { - int i = 0; - -#if CV_SIMD128 - if( haveSIMD128 ) - { - for( ; i <= width-8; i+=8 ) - { - v_int32x4 v_s0 = v_load(SUM + i) + v_load(Sp + i); - v_int32x4 v_s01 = v_load(SUM + i + 4) + v_load(Sp + i + 4); - - v_store(D + i, v_cvt_f32(v_s0)); - v_store(D + i + 4, v_cvt_f32(v_s01)); - - v_store(SUM + i, v_s0 - v_load(Sm + i)); - v_store(SUM + i + 4, v_s01 - v_load(Sm + i + 4)); - } - } -#endif - for( ; i < width; i++ ) - { - int s0 = SUM[i] + Sp[i]; - D[i] = (float)(s0); - SUM[i] = s0 - Sm[i]; - } - } - dst += dststep; - } - } - - double scale; - int sumCount; - std::vector<int> sum; -}; - -#ifdef HAVE_OPENCL - -static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && - (anchor.x == 1) && (anchor.y == 1) && - (ksize.width == 3) && (ksize.height == 3)) ) - return false; - - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - - char build_opts[1024]; - sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : ""); - - ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, (localsize[0] == 0) ?
NULL : localsize, false); -} - -#define DIVUP(total, grain) ((total + grain - 1) / (grain)) -#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) - -static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); - bool doubleSupport = dev.doubleFPConfig() > 0; - - if (ddepth < 0) - ddepth = sdepth; - - if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || - _src.offset() % esz != 0 || _src.step() % esz != 0) - return false; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - int computeUnits = ocl::Device::getDefault().maxComputeUnits(); - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(), wholeSize; - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), - wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; - size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; - - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - int h = isolated ? size.height : wholeSize.height; - int w = isolated ? size.width : wholeSize.width; - - size_t maxWorkItemSizes[32]; - ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); - int tryWorkItems = (int)maxWorkItemSizes[0]; - - ocl::Kernel kernel; - - if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && - ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1, pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = size.width % 2 ? 1 : 2; - pxPerWorkItemY = size.height % 2 ? 
1 : 2; - } - globalsize[0] = size.width / pxPerWorkItemX; - globalsize[1] = size.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = ROUNDUP(globalsize[0], wgRound); - - char build_options[1024], cvt[2][40]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), - normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", - ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV - ); - - - if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - for ( ; ; ) - { - int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); - - while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) - BLOCK_SIZE_X /= 2; - while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) - BLOCK_SIZE_Y *= 2; - - if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) - return false; - - char cvt[2][50]; - String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" - " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" - " -D ST1=%s -D DT1=%s -D cn=%d", - BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), - anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], - isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", - normalize ? " -D NORMALIZE" : "", sqr ? 
" -D SQR" : "", - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); - - localsize[0] = BLOCK_SIZE_X; - globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; - globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y); - - kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); - if (kernel.empty()) - return false; - - size_t kernelWorkGroupSize = kernel.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) - return false; - - tryWorkItems = (int)kernelWorkGroupSize; - } - } - - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; - int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; - idxArg = kernel.set(idxArg, srcOffsetX); - idxArg = kernel.set(idxArg, srcOffsetY); - idxArg = kernel.set(idxArg, srcEndX); - idxArg = kernel.set(idxArg, srcEndY); - idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, localsize, false); -} - -#undef DIVUP -#undef ROUNDUP - -#endif - -} - - -cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) -{ - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); - - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_16U ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32S && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); -} - - -cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, - int anchor, double scale) -{ - int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); - - if( anchor < 0 ) - anchor = ksize/2; - - if( ddepth == CV_8U && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_16U ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_8U && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16U && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_16S && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32S && sdepth == CV_32S ) - return 
makePtr >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_32F && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_32S ) - return makePtr >(ksize, anchor, scale); - if( ddepth == CV_64F && sdepth == CV_64F ) - return makePtr >(ksize, anchor, scale); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of sum format (=%d), and destination format (=%d)", - sumType, dstType)); -} - - -cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, - Point anchor, bool normalize, int borderType ) -{ - int sdepth = CV_MAT_DEPTH(srcType); - int cn = CV_MAT_CN(srcType), sumType = CV_64F; - if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && - ksize.width*ksize.height <= 256 ) - sumType = CV_16U; - else if( sdepth <= CV_32S && (!normalize || - ksize.width*ksize.height <= (sdepth == CV_8U ? (1<<23) : - sdepth == CV_16U ? (1 << 15) : (1 << 16))) ) - sumType = CV_32S; - sumType = CV_MAKETYPE( sumType, cn ); - - Ptr rowFilter = getRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, normalize ? 1./(ksize.width*ksize.height) : 1); - - return makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); -} - -#ifdef HAVE_OPENVX -namespace cv -{ - namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } - } - static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType) - { - if (ddepth < 0) - ddepth = CV_8UC1; - if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3 || - (anchor.x >= 0 && anchor.x != 1) || - (anchor.y >= 0 && anchor.y != 1) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = VX_BORDER_REPLICATE; - break; - default: - return false; - } - - _dst.create(src.size(), CV_8UC1); - Mat dst = _dst.getMat(); - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; - } -} -#endif - -#if 0 //defined(HAVE_IPP) -namespace cv -{ -static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) -{ -#ifdef 
HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Problem with SSE42 optimization for 16s and some 8u modes - if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) - return false; - - // Other optimizations has some degradations too - if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) - return false; -#endif - - if(!normalize) - return false; - - if(!ippiCheckAnchor(anchor, ksize)) - return false; - - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiSize iwKSize = ippiGetSize(ksize); - ::ipp::IwiBorderSize borderSize(iwKSize); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); - } - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); - return false; -#endif -} -} -#endif - - -void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && - (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || - borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), - ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - Mat src = _src.getMat(); - int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); - if( ddepth < 0 ) - ddepth = sdepth; - _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); - Mat dst = _dst.getMat(); - if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) - { - if( src.rows == 1 ) - ksize.height = 1; - if( src.cols == 1 ) - ksize.width = 1; - } - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType&BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) - - //CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); - - borderType = (borderType&~BORDER_ISOLATED); - - Ptr f = createBoxFilter( src.type(), dst.type(), - ksize, anchor, normalize, borderType ); - - f->apply( src, dst, wsz, ofs ); -} - - -void cv::blur( InputArray src, OutputArray dst, - Size ksize, Point anchor, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - boxFilter( src, dst, -1, ksize, anchor, true, borderType ); -} - - -/****************************************************************************************\ - Squared Box Filter -\****************************************************************************************/ - -namespace cv -{ - -template -struct SqrRowSum : - public 
BaseRowFilter -{ - SqrRowSum( int _ksize, int _anchor ) : - BaseRowFilter() - { - ksize = _ksize; - anchor = _anchor; - } - - virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - const T* S = (const T*)src; - ST* D = (ST*)dst; - int i = 0, k, ksz_cn = ksize*cn; - - width = (width - 1)*cn; - for( k = 0; k < cn; k++, S++, D++ ) - { - ST s = 0; - for( i = 0; i < ksz_cn; i += cn ) - { - ST val = (ST)S[i]; - s += val*val; - } - D[0] = s; - for( i = 0; i < width; i += cn ) - { - ST val0 = (ST)S[i], val1 = (ST)S[i + ksz_cn]; - s += val1*val1 - val0*val0; - D[i+cn] = s; - } - } - } -}; - -static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) -{ - int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); - CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); - - if( anchor < 0 ) - anchor = ksize/2; - - if( sdepth == CV_8U && ddepth == CV_32S ) - return makePtr >(ksize, anchor); - if( sdepth == CV_8U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16U && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_16S && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_32F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - if( sdepth == CV_64F && ddepth == CV_64F ) - return makePtr >(ksize, anchor); - - CV_Error_( CV_StsNotImplemented, - ("Unsupported combination of source format (=%d), and buffer format (=%d)", - srcType, sumType)); -} - -} - -void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); - Size size = _src.size(); - - if( ddepth < 0 ) - ddepth = sdepth < CV_32F ? CV_32F : CV_64F; - - if( borderType != BORDER_CONSTANT && normalize ) - { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) - - int sumDepth = CV_64F; - if( sdepth == CV_8U ) - sumDepth = CV_32S; - int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); - - Mat src = _src.getMat(); - _dst.create( size, dstType ); - Mat dst = _dst.getMat(); - - Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, - normalize ? 
1./(ksize.width*ksize.height) : 1); - - Ptr f = makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); - Point ofs; - Size wsz(src.cols, src.rows); - src.locateROI( wsz, ofs ); - - f->apply( src, dst, wsz, ofs ); -} - - /****************************************************************************************\ Gaussian Blur \****************************************************************************************/ @@ -3939,11 +2305,11 @@ static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize, ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib)); ctx.setImmediateBorder(prevBorder); } - catch (ivx::RuntimeError & e) + catch (const ivx::RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (ivx::WrapperError & e) + catch (const ivx::WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index c73574a2e0..7c5bb163f6 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -1357,11 +1357,11 @@ static bool openvx_threshold(Mat src, Mat dst, int thresh, int maxval, int type) } #endif } - catch (ivx::RuntimeError & e) + catch (const ivx::RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (ivx::WrapperError & e) + catch (const ivx::WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp index 5c870971de..1cbbf8456f 100644 --- a/modules/objdetect/include/opencv2/objdetect.hpp +++ b/modules/objdetect/include/opencv2/objdetect.hpp @@ -713,7 +713,6 @@ protected: }; //! @} objdetect - } #include "opencv2/objdetect/detection_based_tracker.hpp" diff --git a/modules/objdetect/src/detection_based_tracker.cpp b/modules/objdetect/src/detection_based_tracker.cpp index d4e5ee755e..29504a43fc 100644 --- a/modules/objdetect/src/detection_based_tracker.cpp +++ b/modules/objdetect/src/detection_based_tracker.cpp @@ -209,23 +209,23 @@ bool cv::DetectionBasedTracker::SeparateDetectionWork::run() } #define CATCH_ALL_AND_LOG(_block) \ - CV_TRY { \ + try { \ _block; \ } \ - CV_CATCH(cv::Exception, e) { \ + catch(const cv::Exception& e) { \ LOGE0("\n %s: ERROR: OpenCV Exception caught: \n'%s'\n\n", CV_Func, e.what()); \ - } CV_CATCH(std::exception, e) { \ + } catch(const std::exception& e) { \ LOGE0("\n %s: ERROR: Exception caught: \n'%s'\n\n", CV_Func, e.what()); \ - } CV_CATCH_ALL { \ + } catch(...) { \ LOGE0("\n %s: ERROR: UNKNOWN Exception caught\n\n", CV_Func); \ } void* cv::workcycleObjectDetectorFunction(void* p) { CATCH_ALL_AND_LOG({ ((cv::DetectionBasedTracker::SeparateDetectionWork*)p)->workcycleObjectDetector(); }); - CV_TRY{ + try{ ((cv::DetectionBasedTracker::SeparateDetectionWork*)p)->init(); - } CV_CATCH_ALL { + } catch(...) 
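The recurring change across these hunks -- CV_TRY/CV_CATCH macros replaced by plain try/catch, and exceptions caught by const reference -- follows the standard C++ guideline: catching by value copies the exception and can slice a derived type down to its base, while a const reference binds directly to the thrown object and keeps its dynamic type. A minimal self-contained sketch of the resulting pattern:

#include <exception>
#include <stdexcept>
#include <cstdio>

static void guarded(void (*body)())
{
    try {
        body();
    }
    catch (const std::runtime_error& e) {  // most specific handler first
        std::printf("runtime error: %s\n", e.what());
    }
    catch (const std::exception& e) {      // any other standard exception
        std::printf("error: %s\n", e.what());
    }
    catch (...) {                          // unknown type, last resort
        std::printf("unknown exception\n");
    }
}

The ordering mirrors CATCH_ALL_AND_LOG above: cv::Exception first, std::exception next, catch(...) last, so each handler sees only what the more specific ones let through.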
{ LOGE0("DetectionBasedTracker: workcycleObjectDetectorFunction: ERROR concerning pointer, received as the function parameter"); } return NULL; diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp index 63a507310c..bc2f9664f8 100644 --- a/modules/objdetect/src/qrcode.cpp +++ b/modules/objdetect/src/qrcode.cpp @@ -1059,8 +1059,8 @@ bool QRDecode::fullDecodingProcess() #endif } -CV_EXPORTS std::string QRCodeDetector::decode(InputArray in, InputArray points, - OutputArray straight_qrcode) +std::string QRCodeDetector::decode(InputArray in, InputArray points, + OutputArray straight_qrcode) { Mat inarr = in.getMat(); CV_Assert(!inarr.empty()); @@ -1096,9 +1096,9 @@ CV_EXPORTS std::string QRCodeDetector::decode(InputArray in, InputArray points, return ok ? decoded_info : std::string(); } -CV_EXPORTS std::string QRCodeDetector::detectAndDecode(InputArray in, - OutputArray points_, - OutputArray straight_qrcode) +std::string QRCodeDetector::detectAndDecode(InputArray in, + OutputArray points_, + OutputArray straight_qrcode) { Mat inarr = in.getMat(); CV_Assert(!inarr.empty()); @@ -1127,5 +1127,4 @@ CV_EXPORTS std::string QRCodeDetector::detectAndDecode(InputArray in, return decoded_info; } - } diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp index 863978bd56..b5cea3e46d 100644 --- a/modules/ts/include/opencv2/ts/ts_ext.hpp +++ b/modules/ts/include/opencv2/ts/ts_ext.hpp @@ -39,7 +39,7 @@ extern int testThreads; Body(); \ CV__TEST_CLEANUP \ } \ - catch (cvtest::SkipTestException& e) \ + catch (const cvtest::SkipTestException& e) \ { \ printf("[ SKIP ] %s\n", e.what()); \ } \ @@ -87,7 +87,7 @@ extern int testThreads; Body(); \ CV__TEST_CLEANUP \ } \ - catch (cvtest::SkipTestException& e) \ + catch (const cvtest::SkipTestException& e) \ { \ printf("[ SKIP ] %s\n", e.what()); \ } \ diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp index 59d2e929df..0346b0e658 100644 --- a/modules/ts/src/ts_perf.cpp +++ b/modules/ts/src/ts_perf.cpp @@ -232,7 +232,7 @@ void Regression::init(const std::string& testSuitName, const std::string& ext) storageOutPath += ext; } } - catch(cv::Exception&) + catch(const cv::Exception&) { LOGE("Failed to open sanity data for reading: %s", storageInPath.c_str()); } @@ -1987,22 +1987,22 @@ void TestBase::RunPerfTestBody() implConf.GetImpl(); #endif } - catch(SkipTestException&) + catch(const SkipTestException&) { metrics.terminationReason = performance_metrics::TERM_SKIP_TEST; return; } - catch(PerfSkipTestException&) + catch(const PerfSkipTestException&) { metrics.terminationReason = performance_metrics::TERM_SKIP_TEST; return; } - catch(PerfEarlyExitException&) + catch(const PerfEarlyExitException&) { metrics.terminationReason = performance_metrics::TERM_INTERRUPT; return;//no additional failure logging } - catch(cv::Exception& e) + catch(const cv::Exception& e) { metrics.terminationReason = performance_metrics::TERM_EXCEPTION; #ifdef HAVE_CUDA @@ -2011,7 +2011,7 @@ void TestBase::RunPerfTestBody() #endif FAIL() << "Expected: PerfTestBody() doesn't throw an exception.\n Actual: it throws cv::Exception:\n " << e.what(); } - catch(std::exception& e) + catch(const std::exception& e) { metrics.terminationReason = performance_metrics::TERM_EXCEPTION; FAIL() << "Expected: PerfTestBody() doesn't throw an exception.\n Actual: it throws std::exception:\n " << e.what(); diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 946648864d..27ba56b909 100644 --- 
a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -1189,11 +1189,11 @@ namespace prevImg.swapHandle(); nextImg.swapHandle(); #endif } - catch (RuntimeError & e) + catch (const RuntimeError & e) { VX_DbgThrow(e.what()); } - catch (WrapperError & e) + catch (const WrapperError & e) { VX_DbgThrow(e.what()); } diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp index 138eca09bf..5007246332 100644 --- a/modules/videoio/include/opencv2/videoio.hpp +++ b/modules/videoio/include/opencv2/videoio.hpp @@ -169,8 +169,10 @@ enum VideoCaptureProperties { CAP_PROP_AUTOFOCUS =39, CAP_PROP_SAR_NUM =40, //!< Sample aspect ratio: num/den (num) CAP_PROP_SAR_DEN =41, //!< Sample aspect ratio: num/den (den) - CAP_PROP_BACKEND =42, //!< current backend (enum VideoCaptureAPIs). Read-only property - CAP_CROSSBAR_INPIN_TYPE =43, //!subtype)) + { + videoSize = w * h; + } + else if (pAmMediaType->subtype == MEDIASUBTYPE_Y16) + { + videoSize = w * h * 2; + } + else + { + videoSize = w * h * 3; + } + sizeSet = true; pixels = new unsigned char[videoSize]; pBuffer = new char[videoSize]; @@ -1060,6 +1103,8 @@ videoInput::videoInput(){ mediaSubtypes[17] = MEDIASUBTYPE_Y8; mediaSubtypes[18] = MEDIASUBTYPE_GREY; mediaSubtypes[19] = MEDIASUBTYPE_I420; + mediaSubtypes[20] = MEDIASUBTYPE_BY8; + mediaSubtypes[21] = MEDIASUBTYPE_Y16; //The video formats we support formatTypes[VI_NTSC_M] = AnalogVideo_NTSC_M; @@ -1181,6 +1226,9 @@ bool videoInput::setupDeviceFourcc(int deviceNumber, int w, int h,int fourcc){ GUID *mediaType = getMediaSubtypeFromFourcc(fourcc); if ( mediaType ) { setAttemptCaptureSize(deviceNumber,w,h,*mediaType); + } else { + DebugPrintOut("SETUP: Unknown GUID \n"); + return false; } } else { setAttemptCaptureSize(deviceNumber,w,h); @@ -1448,6 +1496,37 @@ int videoInput::getSize(int id) const } +// ---------------------------------------------------------------------- +// +// +// ---------------------------------------------------------------------- + +bool videoInput::getConvertRGB(int id) +{ + if (isDeviceSetup(id)) + { + return VDList[id]->convertRGB; + } + else + { + return false; + } + +} + +bool videoInput::setConvertRGB(int id, bool enable) +{ + if (isDeviceSetup(id)) + { + VDList[id]->convertRGB = enable; + return true; + } + else + { + return false; + } +} + // ---------------------------------------------------------------------- // Uses a supplied buffer @@ -1472,7 +1551,24 @@ bool videoInput::getPixels(int id, unsigned char * dstBuffer, bool flipRedAndBlu int height = VDList[id]->height; int width = VDList[id]->width; - processPixels(src, dst, width, height, flipRedAndBlue, flipImage); + // Conditional processing for 8/16-bit images (e-Con systems) + if (checkSingleByteFormat(VDList[id]->pAmMediaType->subtype)) + { + memcpy(dst, src, width * height); + } + else if (VDList[id]->pAmMediaType->subtype == MEDIASUBTYPE_Y16) + { + if (!VDList[id]->convertRGB) { + memcpy(dst, src, width * height * 2); + } + else { + processPixels(src, dst, width, height, flipRedAndBlue, flipImage, 2); + } + } + else + { + processPixels(src, dst, width, height, flipRedAndBlue, flipImage); + } VDList[id]->sgCallback->newFrame = false; LeaveCriticalSection(&VDList[id]->sgCallback->critSection); @@ -2112,62 +2208,81 @@ bool videoInput::setup(int deviceNumber){ // You have any combination of those. 
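A condensed model of the frame-buffer sizing rule the e-con systems support introduces here; it assumes the Windows DirectShow headers and the patch's checkSingleByteFormat() helper are in scope:

// Bytes needed for one frame, by media subtype (sketch of the logic above).
long bufferSizeFor(int w, int h, const GUID& subtype)
{
    if (checkSingleByteFormat(subtype))   // Y8 / GREY / BY8: 1 byte per pixel
        return (long)w * h;
    if (subtype == MEDIASUBTYPE_Y16)      // 16-bit mono: 2 bytes per pixel
        return (long)w * h * 2;
    return (long)w * h * 3;               // everything else arrives as RGB24
}

// What the bytesperpixel == 2 branch added to processPixels below boils down
// to: keep the high byte of each 16-bit sample and replicate it into B, G, R.
void y16ToBgr8(const unsigned short* src, unsigned char* dst, int pixels)
{
    for (int i = 0; i < pixels; i++)
    {
        const unsigned char v = (unsigned char)(src[i] >> 8);
        *dst++ = v;   // B
        *dst++ = v;   // G
        *dst++ = v;   // R
    }
}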
// ---------------------------------------------------------------------- -void videoInput::processPixels(unsigned char * src, unsigned char * dst, int width, int height, bool bRGB, bool bFlip){ +void videoInput::processPixels(unsigned char * src, unsigned char * dst, int width, int height, bool bRGB, bool bFlip, int bytesperpixel){ - int widthInBytes = width * 3; + int widthInBytes = width * bytesperpixel; int numBytes = widthInBytes * height; - if(!bRGB){ + if (bytesperpixel == 2) { + for (int i = 0; i < width*height; i++) { + if (bytesperpixel == 2) { + *dst = (uint8_t) (*((uint16_t*) src) >> 8); + dst++; + + *dst = (uint8_t) (*((uint16_t*)src) >> 8); + dst++; - //int x = 0; - //int y = 0; + *dst = (uint8_t) (*((uint16_t*)src) >> 8); + dst++; - if(bFlip){ - for(int y = 0; y < height; y++){ - memcpy(dst + (y * widthInBytes), src + ( (height -y -1) * widthInBytes), widthInBytes); + src += 2; } - - }else{ - memcpy(dst, src, numBytes); } - }else{ - if(bFlip){ + } + else + { + if(!bRGB){ - int x = 0; - int y = (height - 1) * widthInBytes; - src += y; + //int x = 0; + //int y = 0; - for(int i = 0; i < numBytes; i+=3){ - if(x >= width){ - x = 0; - src -= widthInBytes*2; + if(bFlip){ + for(int y = 0; y < height; y++){ + memcpy(dst + (y * widthInBytes), src + ( (height -y -1) * widthInBytes), widthInBytes); } - *dst = *(src+2); - dst++; + }else{ + memcpy(dst, src, numBytes); + } + }else{ + if(bFlip){ - *dst = *(src+1); - dst++; + int x = 0; + int y = (height - 1) * widthInBytes; + src += y; - *dst = *src; - dst++; + for(int i = 0; i < numBytes; i+=3){ + if(x >= width){ + x = 0; + src -= widthInBytes*2; + } + + *dst = *(src+2); + dst++; + + *dst = *(src+1); + dst++; - src+=3; - x++; + *dst = *src; + dst++; + + src+=3; + x++; + } } - } - else{ - for(int i = 0; i < numBytes; i+=3){ - *dst = *(src+2); - dst++; + else{ + for(int i = 0; i < numBytes; i+=3){ + *dst = *(src+2); + dst++; - *dst = *(src+1); - dst++; + *dst = *(src+1); + dst++; - *dst = *src; - dst++; + *dst = *src; + dst++; - src+=3; + src+=3; + } } } } @@ -2198,6 +2313,8 @@ void videoInput::getMediaSubtypeAsString(GUID type, char * typeAsString){ else if(type == MEDIASUBTYPE_Y8) sprintf(tmpStr, "Y8"); else if(type == MEDIASUBTYPE_GREY) sprintf(tmpStr, "GREY"); else if(type == MEDIASUBTYPE_I420) sprintf(tmpStr, "I420"); + else if (type == MEDIASUBTYPE_BY8) sprintf(tmpStr, "BY8"); + else if (type == MEDIASUBTYPE_Y16) sprintf(tmpStr, "Y16"); else sprintf(tmpStr, "OTHER"); memcpy(typeAsString, tmpStr, sizeof(char)*8); @@ -2339,6 +2456,10 @@ void videoInput::getCameraPropertyAsString(int prop, char * propertyAsString){ memcpy(propertyAsString, tmpStr, sizeof(char)*16); } +GUID videoInput::getMediasubtype(int deviceID) +{ + return VDList[deviceID]->pAmMediaType->subtype; +} //------------------------------------------------------------------------------------------- static void findClosestSizeAndSubtype(videoDevice * VD, int widthIn, int heightIn, int &widthOut, int &heightOut, GUID & mediatypeOut){ @@ -2729,7 +2850,17 @@ int videoInput::start(int deviceID, videoDevice *VD){ ZeroMemory(&mt,sizeof(AM_MEDIA_TYPE)); mt.majortype = MEDIATYPE_Video; - mt.subtype = MEDIASUBTYPE_RGB24; + + // Disable format conversion if using 8/16-bit data (e-Con systems) + if (checkSingleByteFormat(VD->pAmMediaType->subtype) || (VD->pAmMediaType->subtype == MEDIASUBTYPE_Y16)) { + DebugPrintOut("SETUP: Not converting frames to RGB.\n"); + mt.subtype = VD->pAmMediaType->subtype; + } + else + { + DebugPrintOut("SETUP: Converting frames to RGB.\n"); + mt.subtype = 
MEDIASUBTYPE_RGB24; //Making it RGB24, does conversion from YUV to RGB + } mt.formattype = FORMAT_VideoInfo; //VD->pAmMediaType->subtype = VD->videoType; @@ -3270,15 +3401,22 @@ bool VideoCapture_DShow::setProperty(int propIdx, double propVal) case CV_CAP_PROP_FOURCC: m_fourcc = (int)(unsigned long)(propVal); + m_width = (int)getProperty(CAP_PROP_FRAME_WIDTH); + m_height = (int)getProperty(CAP_PROP_FRAME_HEIGHT); + if (-1 == m_fourcc) { // following cvCreateVideo usage will pop up caprturepindialog here if fourcc=-1 // TODO - how to create a capture pin dialog } - handled = true; + else + { + handled = true; + } + break; - case CAP_CROSSBAR_INPIN_TYPE: + case CAP_PROP_CHANNEL: if (cvFloor(propVal) < 0) break; @@ -3312,6 +3450,12 @@ bool VideoCapture_DShow::setProperty(int propIdx, double propVal) } return g_VI.setVideoSettingCamera(m_index, CameraControl_Focus, currentFocus, enabled ? CameraControl_Flags_Auto | CameraControl_Flags_Manual : CameraControl_Flags_Manual, enabled ? true : false); } + + case CV_CAP_PROP_CONVERT_RGB: + { + return g_VI.setConvertRGB(m_index, cvRound(propVal) == 1); + } + } if (handled) @@ -3319,7 +3463,7 @@ bool VideoCapture_DShow::setProperty(int propIdx, double propVal) // a stream setting if (m_width > 0 && m_height > 0) { - if (m_width != g_VI.getWidth(m_index) || m_height != g_VI.getHeight(m_index) )//|| fourcc != VI.getFourcc(index) ) + if (m_width != g_VI.getWidth(m_index) || m_height != g_VI.getHeight(m_index) || m_fourcc != g_VI.getFourcc(m_index) ) { int fps = static_cast(g_VI.getFPS(m_index)); g_VI.stopDevice(m_index); @@ -3330,10 +3474,14 @@ bool VideoCapture_DShow::setProperty(int propIdx, double propVal) bool success = g_VI.isDeviceSetup(m_index); if (success) { + DebugPrintOut("SETUP: Updated FourCC\n"); m_widthSet = m_width; m_heightSet = m_height; m_width = m_height = m_fourcc = -1; } + else { + DebugPrintOut("SETUP: Couldn't update FourCC\n"); + } return success; } return true; @@ -3383,7 +3531,18 @@ bool VideoCapture_DShow::grabFrame() } bool VideoCapture_DShow::retrieveFrame(int, OutputArray frame) { - frame.create(Size(g_VI.getWidth(m_index), g_VI.getHeight(m_index)), CV_8UC3); + int w = g_VI.getWidth(m_index), h = g_VI.getHeight(m_index); + bool convertRGB = g_VI.getConvertRGB(m_index); + + // Set suitable output matrix type (e-Con systems) + if (checkSingleByteFormat(g_VI.getMediasubtype(m_index))){ + frame.create(Size(w, h), CV_8UC1); + } else if (g_VI.getMediasubtype(m_index) == MEDIASUBTYPE_Y16 && !convertRGB) { + frame.create(Size(w, h), CV_16UC1); + } else { + frame.create(Size(w, h), CV_8UC3); + } + cv::Mat mat = frame.getMat(); return g_VI.getPixels(m_index, mat.ptr(), false, true ); } diff --git a/modules/videoio/src/cap_gphoto2.cpp b/modules/videoio/src/cap_gphoto2.cpp index cab67b2b6d..d207cbab1c 100644 --- a/modules/videoio/src/cap_gphoto2.cpp +++ b/modules/videoio/src/cap_gphoto2.cpp @@ -70,7 +70,7 @@ public: return gp_result_as_string(result); } friend std::ostream & operator<<(std::ostream & ostream, - GPhoto2Exception & e) + const GPhoto2Exception & e) { return ostream << e.method << ": " << e.what(); } @@ -336,7 +336,7 @@ void DigitalCameraCapture::initContext() CR(gp_camera_autodetect(allDevices, context)); CR(numDevices = gp_list_count(allDevices)); } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { numDevices = 0; } @@ -389,7 +389,7 @@ DigitalCameraCapture::~DigitalCameraCapture() gp_context_unref(context); context = NULL; } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { 
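For context on the DirectShow changes above, this is how the new raw-capture path is meant to be driven from user code: pick a 16-bit format, switch CAP_PROP_CONVERT_RGB off, and retrieveFrame then allocates CV_16UC1 instead of CV_8UC3. A sketch only -- the 'Y16 ' fourcc and the availability of a 16-bit mode depend entirely on the camera:

#include <opencv2/videoio.hpp>

cv::Mat grabRawY16(int index)
{
    cv::VideoCapture cap(index + cv::CAP_DSHOW);   // force the DirectShow backend
    cap.set(cv::CAP_PROP_FOURCC, cv::VideoWriter::fourcc('Y', '1', '6', ' '));
    cap.set(cv::CAP_PROP_CONVERT_RGB, 0);          // keep the sensor's 16-bit data
    cv::Mat frame;
    cap >> frame;                                  // CV_16UC1 when Y16 is delivered
    return frame;
}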
message(ERROR, "destruction error", e); } @@ -442,7 +442,7 @@ bool DigitalCameraCapture::open(int index) opened = true; return true; } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { message(WARNING, "opening device failed", e); return false; @@ -491,7 +491,7 @@ void DigitalCameraCapture::close() rootWidget = NULL; } } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { message(ERROR, "cannot close device properly", e); } @@ -664,7 +664,7 @@ double DigitalCameraCapture::getProperty(int propertyId) const } } } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { char buf[128] = ""; sprintf(buf, "cannot get property: %d", propertyId); @@ -807,7 +807,7 @@ bool DigitalCameraCapture::setProperty(int propertyId, double value) CR(gp_widget_set_changed(widget, 0)); } } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { char buf[128] = ""; sprintf(buf, "cannot set property: %d to %f", propertyId, value); @@ -849,7 +849,7 @@ bool DigitalCameraCapture::grabFrame() capturedFrames++; grabbedFrames.push_back(file); } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { if (file) gp_file_unref(file); @@ -873,7 +873,7 @@ bool DigitalCameraCapture::retrieveFrame(int, OutputArray outputFrame) readFrameFromFile(file, outputFrame); CR(gp_file_unref(file)); } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { message(WARNING, "cannot read file grabbed from device", e); return false; @@ -914,7 +914,7 @@ int DigitalCameraCapture::findDevice(const char * deviceName) const } } } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { ; // pass } @@ -980,7 +980,7 @@ CameraWidget * DigitalCameraCapture::findWidgetByName( } return (it != end) ? it->second : NULL; } - catch (GPhoto2Exception & e) + catch (const GPhoto2Exception & e) { message(WARNING, "error while searching for widget", e); } diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp index 246e18acae..4a628f961c 100644 --- a/modules/videoio/src/cap_v4l.cpp +++ b/modules/videoio/src/cap_v4l.cpp @@ -242,10 +242,8 @@ make & enjoy! #define DEFAULT_V4L_HEIGHT 480 #define DEFAULT_V4L_FPS 30 -#define CHANNEL_NUMBER 1 #define MAX_CAMERAS 8 - // default and maximum number of V4L buffers, not including last, 'special' buffer #define MAX_V4L_BUFFERS 10 #define DEFAULT_V4L_BUFFERS 4 @@ -253,16 +251,22 @@ make & enjoy! // if enabled, then bad JPEG warnings become errors and cause NULL returned instead of image #define V4L_ABORT_BADJPEG -#define MAX_DEVICE_DRIVER_NAME 80 - namespace cv { /* Device Capture Objects */ /* V4L2 structure */ -struct buffer +struct Buffer { void * start; size_t length; + // This is dequeued buffer. It used for to put it back in the queue. + // The buffer is valid only if capture->bufferIndex >= 0 + v4l2_buffer buffer; + + Buffer() : start(NULL), length(0) + { + buffer = v4l2_buffer(); + } }; struct CvCaptureCAM_V4L CV_FINAL : public CvCapture @@ -271,10 +275,9 @@ struct CvCaptureCAM_V4L CV_FINAL : public CvCapture int deviceHandle; int bufferIndex; - int FirstCapture; + bool FirstCapture; String deviceName; - char *memoryMap; IplImage frame; __u32 palette; @@ -285,149 +288,163 @@ struct CvCaptureCAM_V4L CV_FINAL : public CvCapture bool convert_rgb; bool frame_allocated; bool returnFrame; + // To select a video input set cv::CAP_PROP_CHANNEL to channel number. + // If the new channel number is than 0, then a video input will not change + int channelNumber; + // Normalize properties. 
If set, parameters will be converted to/from the [0,1) range.
+    // Enabled by default (as OpenCV 3.x does).
+    // Value is initialized from the environment variable `OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED`:
+    // to select real parameter values after the device is open, set cv::CAP_PROP_MODE to 0;
+    // any other value reverts to the backward-compatible mode (with normalized properties).
+    // Range normalization affects the following parameters:
+    // cv::CAP_PROP_*: BRIGHTNESS,CONTRAST,SATURATION,HUE,GAIN,EXPOSURE,FOCUS,AUTOFOCUS,AUTO_EXPOSURE.
+    bool normalizePropRange;

     /* V4L2 variables */
-    buffer buffers[MAX_V4L_BUFFERS + 1];
-    v4l2_capability cap;
-    v4l2_input inp;
+    Buffer buffers[MAX_V4L_BUFFERS + 1];
+    v4l2_capability capability;
+    v4l2_input videoInput;
     v4l2_format form;
-    v4l2_crop crop;
-    v4l2_cropcap cropcap;
     v4l2_requestbuffers req;
     v4l2_buf_type type;
-    v4l2_queryctrl queryctrl;
     timeval timestamp;

-    /* V4L2 control variables */
-    Range focus, brightness, contrast, saturation, hue, gain, exposure;
-
     bool open(int _index);
     bool open(const char* deviceName);
+    bool isOpened() const;

     virtual double getProperty(int) const CV_OVERRIDE;
     virtual bool setProperty(int, double) CV_OVERRIDE;
     virtual bool grabFrame() CV_OVERRIDE;
     virtual IplImage* retrieveFrame(int) CV_OVERRIDE;

-    Range getRange(int property_id) const {
-        switch (property_id) {
-        case CV_CAP_PROP_BRIGHTNESS:
-            return brightness;
-        case CV_CAP_PROP_CONTRAST:
-            return contrast;
-        case CV_CAP_PROP_SATURATION:
-            return saturation;
-        case CV_CAP_PROP_HUE:
-            return hue;
-        case CV_CAP_PROP_GAIN:
-            return gain;
-        case CV_CAP_PROP_EXPOSURE:
-            return exposure;
-        case CV_CAP_PROP_FOCUS:
-            return focus;
-        case CV_CAP_PROP_AUTOFOCUS:
-            return Range(0, 1);
-        case CV_CAP_PROP_AUTO_EXPOSURE:
-            return Range(0, 4);
-        default:
-            return Range(0, 255);
-        }
-    }
-
+    CvCaptureCAM_V4L();
     virtual ~CvCaptureCAM_V4L();
+    bool requestBuffers();
+    bool requestBuffers(unsigned int buffer_number);
+    bool createBuffers();
+    void releaseBuffers();
+    bool initCapture();
+    bool streaming(bool startStream);
+    bool setFps(int value);
+    bool tryIoctl(unsigned long ioctlCode, void *parameter) const;
+    bool controlInfo(int property_id, __u32 &v4l2id, cv::Range &range) const;
+    bool icvControl(__u32 v4l2id, int &value, bool isSet) const;
+
+    bool icvSetFrameSize(int _width, int _height);
+    bool v4l2_reset();
+    bool setVideoInputChannel();
+    bool try_palette_v4l2();
+    bool try_init_v4l2();
+    bool autosetup_capture_mode_v4l2();
+    void v4l2_create_frame();
+    bool read_frame_v4l2();
+    bool convertableToRgb() const;
+    void convertToRgb(const Buffer &currentBuffer);
+    void releaseFrame();
 };

-static void icvCloseCAM_V4L( CvCaptureCAM_V4L* capture );
-
-static bool icvGrabFrameCAM_V4L( CvCaptureCAM_V4L* capture );
-static IplImage* icvRetrieveFrameCAM_V4L( CvCaptureCAM_V4L* capture, int );
-
-static double icvGetPropertyCAM_V4L( const CvCaptureCAM_V4L* capture, int property_id );
-static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture, int property_id, double value );
-
 /*********************** Implementations ***************************************/

+CvCaptureCAM_V4L::CvCaptureCAM_V4L() : deviceHandle(-1), bufferIndex(-1)
+{}
+
 CvCaptureCAM_V4L::~CvCaptureCAM_V4L()
 {
-    icvCloseCAM_V4L(this);
+    streaming(false);
+    releaseBuffers();
+    if(deviceHandle != -1)
+        close(deviceHandle);
 }

-static bool try_palette_v4l2(CvCaptureCAM_V4L* capture)
+bool CvCaptureCAM_V4L::isOpened() const
 {
-    capture->form = v4l2_format();
-    capture->form.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-
capture->form.fmt.pix.pixelformat = capture->palette; - capture->form.fmt.pix.field = V4L2_FIELD_ANY; - capture->form.fmt.pix.width = capture->width; - capture->form.fmt.pix.height = capture->height; - - if (-1 == ioctl (capture->deviceHandle, VIDIOC_S_FMT, &capture->form)) + return deviceHandle != -1; +} + +bool CvCaptureCAM_V4L::try_palette_v4l2() +{ + form = v4l2_format(); + form.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + form.fmt.pix.pixelformat = palette; + form.fmt.pix.field = V4L2_FIELD_ANY; + form.fmt.pix.width = width; + form.fmt.pix.height = height; + + if (!tryIoctl(VIDIOC_S_FMT, &form)) return false; - return capture->palette == capture->form.fmt.pix.pixelformat; + return palette == form.fmt.pix.pixelformat; } -static int try_init_v4l2(CvCaptureCAM_V4L* capture, const char *deviceName) +bool CvCaptureCAM_V4L::setVideoInputChannel() { - // Test device for V4L2 compatibility - // Return value: - // -1 then unable to open device - // 0 then detected nothing - // 1 then V4L2 device + if(channelNumber < 0) + return true; + /* Query channels number */ + int channel = 0; + if (!tryIoctl(VIDIOC_G_INPUT, &channel)) + return false; - int deviceIndex; + if(channel == channelNumber) + return true; - /* Open and test V4L2 device */ - capture->deviceHandle = open (deviceName, O_RDWR /* required */ | O_NONBLOCK, 0); - if (-1 == capture->deviceHandle) - { -#ifndef NDEBUG - fprintf(stderr, "(DEBUG) try_init_v4l2 open \"%s\": %s\n", deviceName, strerror(errno)); -#endif - icvCloseCAM_V4L(capture); - return -1; - } + /* Query information about new input channel */ + videoInput = v4l2_input(); + videoInput.index = channelNumber; + if (!tryIoctl(VIDIOC_ENUMINPUT, &videoInput)) + return false; + + //To select a video input applications store the number of the desired input in an integer + // and call the VIDIOC_S_INPUT ioctl with a pointer to this integer. Side effects are possible. + // For example inputs may support different video standards, so the driver may implicitly + // switch the current standard. + // It is good practice to select an input before querying or negotiating any other parameters. + return tryIoctl(VIDIOC_S_INPUT, &channelNumber); +} + +bool CvCaptureCAM_V4L::try_init_v4l2() +{ + /* The following code sets the CHANNEL_NUMBER of the video input. Some video sources + have sub "Channel Numbers". For a typical V4L TV capture card, this is usually 1. + I myself am using a simple NTSC video input capture card that uses the value of 1. + If you are not in North America or have a different video standard, you WILL have to change + the following settings and recompile/reinstall. 
This set of settings is based on + the most commonly encountered input video source types (like my bttv card) */ - capture->cap = v4l2_capability(); - if (-1 == ioctl (capture->deviceHandle, VIDIOC_QUERYCAP, &capture->cap)) + // The cv::CAP_PROP_MODE used for set the video input channel number + if (!setVideoInputChannel()) { #ifndef NDEBUG - fprintf(stderr, "(DEBUG) try_init_v4l2 VIDIOC_QUERYCAP \"%s\": %s\n", deviceName, strerror(errno)); + fprintf(stderr, "(DEBUG) V4L2: Unable to set Video Input Channel."); #endif - icvCloseCAM_V4L(capture); - return 0; + return false; } - /* Query channels number */ - if (-1 == ioctl (capture->deviceHandle, VIDIOC_G_INPUT, &deviceIndex)) + // Test device for V4L2 compatibility + capability = v4l2_capability(); + if (!tryIoctl(VIDIOC_QUERYCAP, &capability)) { #ifndef NDEBUG - fprintf(stderr, "(DEBUG) try_init_v4l2 VIDIOC_G_INPUT \"%s\": %s\n", deviceName, strerror(errno)); + fprintf(stderr, "(DEBUG) V4L2: Unable to query capability."); #endif - icvCloseCAM_V4L(capture); - return 0; + return false; } - /* Query information about current input */ - capture->inp = v4l2_input(); - capture->inp.index = deviceIndex; - if (-1 == ioctl (capture->deviceHandle, VIDIOC_ENUMINPUT, &capture->inp)) + if ((capability.capabilities & V4L2_CAP_VIDEO_CAPTURE) == 0) { -#ifndef NDEBUG - fprintf(stderr, "(DEBUG) try_init_v4l2 VIDIOC_ENUMINPUT \"%s\": %s\n", deviceName, strerror(errno)); -#endif - icvCloseCAM_V4L(capture); - return 0; + /* Nope. */ + fprintf(stderr, "VIDEOIO ERROR: V4L2: Unable to capture video memory."); + return false; } - - return 1; - + return true; } -static int autosetup_capture_mode_v4l2(CvCaptureCAM_V4L* capture) { +bool CvCaptureCAM_V4L::autosetup_capture_mode_v4l2() +{ //in case palette is already set and works, no need to setup. 
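User-side view of the channel plumbing introduced above: writing CAP_PROP_CHANNEL hands the requested input to setVideoInputChannel(), which validates it with VIDIOC_ENUMINPUT before switching via VIDIOC_S_INPUT. Illustrative only; input numbering is device-specific:

#include <opencv2/videoio.hpp>

// Select, e.g., the composite input on a multi-input V4L2 capture card.
bool selectInput(cv::VideoCapture& cap, int channel)
{
    return cap.set(cv::CAP_PROP_CHANNEL, channel);
}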
- if(capture->palette != 0 and try_palette_v4l2(capture)){ - return 0; + if (palette != 0 && try_palette_v4l2()) { + return true; } __u32 try_order[] = { V4L2_PIX_FMT_BGR24, @@ -437,6 +454,8 @@ static int autosetup_capture_mode_v4l2(CvCaptureCAM_V4L* capture) { V4L2_PIX_FMT_YUV411P, V4L2_PIX_FMT_YUYV, V4L2_PIX_FMT_UYVY, + V4L2_PIX_FMT_NV12, + V4L2_PIX_FMT_NV21, V4L2_PIX_FMT_SBGGR8, V4L2_PIX_FMT_SGBRG8, V4L2_PIX_FMT_SN9C10X, @@ -449,313 +468,257 @@ static int autosetup_capture_mode_v4l2(CvCaptureCAM_V4L* capture) { }; for (size_t i = 0; i < sizeof(try_order) / sizeof(__u32); i++) { - capture->palette = try_order[i]; - if (try_palette_v4l2(capture)) { - return 0; + palette = try_order[i]; + if (try_palette_v4l2()) { + return true; } } - - fprintf(stderr, - "VIDEOIO ERROR: V4L2: Pixel format of incoming image is unsupported by OpenCV\n"); - icvCloseCAM_V4L(capture); - return -1; + return false; } -static void v4l2_control_range(CvCaptureCAM_V4L* cap, __u32 id) +bool CvCaptureCAM_V4L::setFps(int value) { - cap->queryctrl= v4l2_queryctrl(); - cap->queryctrl.id = id; - - if(0 != ioctl(cap->deviceHandle, VIDIOC_QUERYCTRL, &cap->queryctrl)) - { - if (errno != EINVAL) - perror ("VIDIOC_QUERYCTRL"); - return; - } - - if (cap->queryctrl.flags & V4L2_CTRL_FLAG_DISABLED) - return; + if (!isOpened()) + return false; - Range range(cap->queryctrl.minimum, cap->queryctrl.maximum); + v4l2_streamparm streamparm = v4l2_streamparm(); + streamparm.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + streamparm.parm.capture.timeperframe.numerator = 1; + streamparm.parm.capture.timeperframe.denominator = __u32(value); + if (!tryIoctl(VIDIOC_S_PARM, &streamparm) || !tryIoctl(VIDIOC_G_PARM, &streamparm)) + return false; - switch(cap->queryctrl.id) { - case V4L2_CID_BRIGHTNESS: - cap->brightness = range; - break; - case V4L2_CID_CONTRAST: - cap->contrast = range; - break; - case V4L2_CID_SATURATION: - cap->saturation = range; - break; - case V4L2_CID_HUE: - cap->hue = range; - break; - case V4L2_CID_GAIN: - cap->gain = range; - break; - case V4L2_CID_EXPOSURE_ABSOLUTE: - cap->exposure = range; - break; - case V4L2_CID_FOCUS_ABSOLUTE: - cap->focus = range; - break; - } + fps = streamparm.parm.capture.timeperframe.denominator; + return true; } -static void v4l2_scan_controls(CvCaptureCAM_V4L* capture) +bool CvCaptureCAM_V4L::convertableToRgb() const { - - __u32 ctrl_id; - - for (ctrl_id = V4L2_CID_BASE; ctrl_id < V4L2_CID_LASTP1; ctrl_id++) - { - v4l2_control_range(capture, ctrl_id); - } - - for (ctrl_id = V4L2_CID_PRIVATE_BASE;;ctrl_id++) - { - errno = 0; - - v4l2_control_range(capture, ctrl_id); - - if (errno) - break; - } - - v4l2_control_range(capture, V4L2_CID_FOCUS_ABSOLUTE); -} - -static int v4l2_set_fps(CvCaptureCAM_V4L* capture) { - v4l2_streamparm setfps = v4l2_streamparm(); - setfps.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - setfps.parm.capture.timeperframe.numerator = 1; - setfps.parm.capture.timeperframe.denominator = capture->fps; - return ioctl (capture->deviceHandle, VIDIOC_S_PARM, &setfps); -} - -static int v4l2_num_channels(__u32 palette) { - switch(palette) { + switch (palette) { case V4L2_PIX_FMT_YVU420: case V4L2_PIX_FMT_YUV420: + case V4L2_PIX_FMT_NV12: + case V4L2_PIX_FMT_NV21: + case V4L2_PIX_FMT_YUV411P: +#ifdef HAVE_JPEG case V4L2_PIX_FMT_MJPEG: case V4L2_PIX_FMT_JPEG: - case V4L2_PIX_FMT_Y16: - case V4L2_PIX_FMT_GREY: - return 1; +#endif case V4L2_PIX_FMT_YUYV: case V4L2_PIX_FMT_UYVY: - return 2; - case V4L2_PIX_FMT_BGR24: + case V4L2_PIX_FMT_SBGGR8: + case V4L2_PIX_FMT_SN9C10X: + case V4L2_PIX_FMT_SGBRG8: case 
V4L2_PIX_FMT_RGB24: - return 3; + case V4L2_PIX_FMT_Y16: + case V4L2_PIX_FMT_GREY: + case V4L2_PIX_FMT_BGR24: + return true; default: - return 0; + break; } + return false; } -static void v4l2_create_frame(CvCaptureCAM_V4L *capture) { - CvSize size = {capture->form.fmt.pix.width, capture->form.fmt.pix.height}; +void CvCaptureCAM_V4L::v4l2_create_frame() +{ + CvSize size = {form.fmt.pix.width, form.fmt.pix.height}; int channels = 3; int depth = IPL_DEPTH_8U; - if (!capture->convert_rgb) { - channels = v4l2_num_channels(capture->palette); - - switch(capture->palette) { - case V4L2_PIX_FMT_MJPEG: - case V4L2_PIX_FMT_JPEG: - size = cvSize(capture->buffers[capture->bufferIndex].length, 1); + if (!convert_rgb) { + switch (palette) { + case V4L2_PIX_FMT_BGR24: + case V4L2_PIX_FMT_RGB24: + break; + case V4L2_PIX_FMT_YUYV: + case V4L2_PIX_FMT_UYVY: + channels = 2; break; case V4L2_PIX_FMT_YVU420: case V4L2_PIX_FMT_YUV420: + case V4L2_PIX_FMT_NV12: + case V4L2_PIX_FMT_NV21: + channels = 1; size.height = size.height * 3 / 2; // "1.5" channels break; case V4L2_PIX_FMT_Y16: - if(!capture->convert_rgb){ - depth = IPL_DEPTH_16U; - } + depth = IPL_DEPTH_16U; + /* fallthru */ + case V4L2_PIX_FMT_GREY: + channels = 1; + break; + case V4L2_PIX_FMT_MJPEG: + case V4L2_PIX_FMT_JPEG: + default: + channels = 1; + if(bufferIndex < 0) + size = cvSize(buffers[MAX_V4L_BUFFERS].length, 1); + else + size = cvSize(buffers[bufferIndex].buffer.bytesused, 1); break; } } /* Set up Image data */ - cvInitImageHeader(&capture->frame, size, depth, channels); + cvInitImageHeader(&frame, size, depth, channels); /* Allocate space for pixelformat we convert to. * If we do not convert frame is just points to the buffer */ - if(capture->convert_rgb) { - capture->frame.imageData = (char*)cvAlloc(capture->frame.imageSize); + releaseFrame(); + // we need memory iff convert_rgb is true + if (convert_rgb) { + frame.imageData = (char *)cvAlloc(frame.imageSize); + frame_allocated = true; } - - capture->frame_allocated = capture->convert_rgb; } -static int _capture_V4L2 (CvCaptureCAM_V4L *capture) +bool CvCaptureCAM_V4L::initCapture() { - const char* deviceName = capture->deviceName.c_str(); - if (try_init_v4l2(capture, deviceName) != 1) { - /* init of the v4l2 device is not OK */ - return -1; - } - - /* V4L2 control variables are zero (memset above) */ - - /* Scan V4L2 controls */ - v4l2_scan_controls(capture); + if (!isOpened()) + return false; - if ((capture->cap.capabilities & V4L2_CAP_VIDEO_CAPTURE) == 0) { - /* Nope. */ - fprintf( stderr, "VIDEOIO ERROR: V4L2: device %s is unable to capture video memory.\n",deviceName); - icvCloseCAM_V4L(capture); - return -1; + if (!try_init_v4l2()) { +#ifndef NDEBUG + fprintf(stderr, " try_init_v4l2 open \"%s\": %s\n", deviceName.c_str(), strerror(errno)); +#endif + return false; } - /* The following code sets the CHANNEL_NUMBER of the video input. Some video sources - have sub "Channel Numbers". For a typical V4L TV capture card, this is usually 1. - I myself am using a simple NTSC video input capture card that uses the value of 1. - If you are not in North America or have a different video standard, you WILL have to change - the following settings and recompile/reinstall. 
This set of settings is based on - the most commonly encountered input video source types (like my bttv card) */ - - if(capture->inp.index > 0) { - capture->inp = v4l2_input(); - capture->inp.index = CHANNEL_NUMBER; - /* Set only channel number to CHANNEL_NUMBER */ - /* V4L2 have a status field from selected video mode */ - if (-1 == ioctl (capture->deviceHandle, VIDIOC_ENUMINPUT, &capture->inp)) - { - fprintf (stderr, "VIDEOIO ERROR: V4L2: Aren't able to set channel number\n"); - icvCloseCAM_V4L (capture); - return -1; - } - } /* End if */ - /* Find Window info */ - capture->form = v4l2_format(); - capture->form.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + form = v4l2_format(); + form.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - if (-1 == ioctl (capture->deviceHandle, VIDIOC_G_FMT, &capture->form)) { - fprintf( stderr, "VIDEOIO ERROR: V4L2: Could not obtain specifics of capture window.\n\n"); - icvCloseCAM_V4L(capture); - return -1; + if (!tryIoctl(VIDIOC_G_FMT, &form)) { + fprintf( stderr, "VIDEOIO ERROR: V4L2: Could not obtain specifics of capture window.\n"); + return false; } - if (autosetup_capture_mode_v4l2(capture) == -1) - return -1; + if (!autosetup_capture_mode_v4l2()) { + fprintf(stderr, "VIDEOIO ERROR: V4L2: Pixel format of incoming image is unsupported by OpenCV\n"); + return false; + } /* try to set framerate */ - v4l2_set_fps(capture); + setFps(fps); unsigned int min; /* Buggy driver paranoia. */ - min = capture->form.fmt.pix.width * 2; + min = form.fmt.pix.width * 2; - if (capture->form.fmt.pix.bytesperline < min) - capture->form.fmt.pix.bytesperline = min; + if (form.fmt.pix.bytesperline < min) + form.fmt.pix.bytesperline = min; - min = capture->form.fmt.pix.bytesperline * capture->form.fmt.pix.height; + min = form.fmt.pix.bytesperline * form.fmt.pix.height; - if (capture->form.fmt.pix.sizeimage < min) - capture->form.fmt.pix.sizeimage = min; + if (form.fmt.pix.sizeimage < min) + form.fmt.pix.sizeimage = min; - capture->req = v4l2_requestbuffers(); + if (!requestBuffers()) + return false; - unsigned int buffer_number = capture->bufferSize; + if (!createBuffers()) { + /* free capture, and returns an error code */ + releaseBuffers(); + return false; + } -try_again: + v4l2_create_frame(); - capture->req.count = buffer_number; - capture->req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - capture->req.memory = V4L2_MEMORY_MMAP; + // reinitialize buffers + FirstCapture = true; - if (-1 == ioctl (capture->deviceHandle, VIDIOC_REQBUFS, &capture->req)) - { - if (EINVAL == errno) - { - fprintf (stderr, "%s does not support memory mapping\n", deviceName); - } else { - perror ("VIDIOC_REQBUFS"); - } - /* free capture, and returns an error code */ - icvCloseCAM_V4L (capture); - return -1; + return true; +}; + +bool CvCaptureCAM_V4L::requestBuffers() +{ + unsigned int buffer_number = bufferSize; + while (buffer_number > 0) { + if (!requestBuffers(buffer_number)) + return false; + if (req.count >= buffer_number) + break; + + buffer_number--; + fprintf(stderr, "Insufficient buffer memory on %s -- decreasing buffers\n", deviceName.c_str()); + } + if (buffer_number < 1) { + fprintf(stderr, "Insufficient buffer memory on %s\n", deviceName.c_str()); + return false; } + bufferSize = req.count; + return true; +} - if (capture->req.count < buffer_number) - { - if (buffer_number == 1) - { - fprintf (stderr, "Insufficient buffer memory on %s\n", deviceName); +bool CvCaptureCAM_V4L::requestBuffers(unsigned int buffer_number) +{ + if (!isOpened()) + return false; - /* free capture, and returns an error code */ - 
icvCloseCAM_V4L (capture); - return -1; - } else { - buffer_number--; - fprintf (stderr, "Insufficient buffer memory on %s -- decreaseing buffers\n", deviceName); + req = v4l2_requestbuffers(); + req.count = buffer_number; + req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + req.memory = V4L2_MEMORY_MMAP; - goto try_again; + if (!tryIoctl(VIDIOC_REQBUFS, &req)) { + if (EINVAL == errno) { + fprintf(stderr, "%s does not support memory mapping\n", deviceName.c_str()); + } else { + perror("VIDIOC_REQBUFS"); } + return false; } + return true; +} - for (unsigned int n_buffers = 0; n_buffers < capture->req.count; ++n_buffers) - { +bool CvCaptureCAM_V4L::createBuffers() +{ + size_t maxLength = 0; + for (unsigned int n_buffers = 0; n_buffers < req.count; ++n_buffers) { v4l2_buffer buf = v4l2_buffer(); buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; buf.memory = V4L2_MEMORY_MMAP; buf.index = n_buffers; - if (-1 == ioctl (capture->deviceHandle, VIDIOC_QUERYBUF, &buf)) { - perror ("VIDIOC_QUERYBUF"); - - /* free capture, and returns an error code */ - icvCloseCAM_V4L (capture); - return -1; + if (!tryIoctl(VIDIOC_QUERYBUF, &buf)) { + perror("VIDIOC_QUERYBUF"); + return false; } - capture->buffers[n_buffers].length = buf.length; - capture->buffers[n_buffers].start = - mmap (NULL /* start anywhere */, - buf.length, - PROT_READ | PROT_WRITE /* required */, - MAP_SHARED /* recommended */, - capture->deviceHandle, buf.m.offset); - - if (MAP_FAILED == capture->buffers[n_buffers].start) { - perror ("mmap"); - - /* free capture, and returns an error code */ - icvCloseCAM_V4L (capture); - return -1; - } + buffers[n_buffers].length = buf.length; + buffers[n_buffers].start = + mmap(NULL /* start anywhere */, + buf.length, + PROT_READ /* required */, + MAP_SHARED /* recommended */, + deviceHandle, buf.m.offset); - if (n_buffers == 0) { - capture->buffers[MAX_V4L_BUFFERS].start = malloc( buf.length ); - capture->buffers[MAX_V4L_BUFFERS].length = buf.length; + if (MAP_FAILED == buffers[n_buffers].start) { + perror("mmap"); + return false; } + maxLength = maxLength > buf.length ? maxLength : buf.length; } - - v4l2_create_frame(capture); - - // reinitialize buffers - capture->FirstCapture = 1; - - return 1; -}; /* End _capture_V4L2 */ + if (maxLength > 0) { + buffers[MAX_V4L_BUFFERS].start = malloc(maxLength); + buffers[MAX_V4L_BUFFERS].length = maxLength; + } + return buffers[MAX_V4L_BUFFERS].start != 0; +} /** * some properties can not be changed while the device is in streaming mode. * this method closes and re-opens the device to re-start the stream. * this also causes buffers to be reallocated if the frame size was changed. 
*/ -static bool v4l2_reset( CvCaptureCAM_V4L* capture) { - String deviceName = capture->deviceName; - icvCloseCAM_V4L(capture); - capture->deviceName = deviceName; - return _capture_V4L2(capture) == 1; +bool CvCaptureCAM_V4L::v4l2_reset() +{ + streaming(false); + releaseBuffers(); + return initCapture(); } bool CvCaptureCAM_V4L::open(int _index) @@ -802,154 +765,127 @@ bool CvCaptureCAM_V4L::open(const char* _deviceName) #ifndef NDEBUG fprintf(stderr, "(DEBUG) V4L: opening %s\n", _deviceName); #endif - FirstCapture = 1; + FirstCapture = true; width = DEFAULT_V4L_WIDTH; height = DEFAULT_V4L_HEIGHT; width_set = height_set = 0; bufferSize = DEFAULT_V4L_BUFFERS; fps = DEFAULT_V4L_FPS; convert_rgb = true; + frame_allocated = false; deviceName = _deviceName; returnFrame = true; + normalizePropRange = utils::getConfigurationParameterBool("OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED", true); + channelNumber = -1; + bufferIndex = -1; + + deviceHandle = ::open(deviceName.c_str(), O_RDWR /* required */ | O_NONBLOCK, 0); + if (deviceHandle == -1) + return false; - return _capture_V4L2(this) == 1; + return initCapture(); } -static int read_frame_v4l2(CvCaptureCAM_V4L* capture) { +bool CvCaptureCAM_V4L::read_frame_v4l2() +{ v4l2_buffer buf = v4l2_buffer(); - buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; buf.memory = V4L2_MEMORY_MMAP; - if (-1 == ioctl (capture->deviceHandle, VIDIOC_DQBUF, &buf)) { - switch (errno) { - case EAGAIN: - return 0; - - case EIO: - if (!(buf.flags & (V4L2_BUF_FLAG_QUEUED | V4L2_BUF_FLAG_DONE))) - { - if (ioctl(capture->deviceHandle, VIDIOC_QBUF, &buf) == -1) - { - return 0; - } - } - return 0; - - default: - /* display the error and stop processing */ - capture->returnFrame = false; - perror ("VIDIOC_DQBUF"); - return -1; + while (!tryIoctl(VIDIOC_DQBUF, &buf)) { + if (errno == EIO && !(buf.flags & (V4L2_BUF_FLAG_QUEUED | V4L2_BUF_FLAG_DONE))) { + // Maybe buffer not in the queue? Try to put there + if (!tryIoctl(VIDIOC_QBUF, &buf)) + return false; + continue; } + /* display the error and stop processing */ + returnFrame = false; + perror("VIDIOC_DQBUF"); + return false; } - assert(buf.index < capture->req.count); + assert(buf.index < req.count); + assert(buffers[buf.index].length == buf.length); - memcpy(capture->buffers[MAX_V4L_BUFFERS].start, - capture->buffers[buf.index].start, - capture->buffers[MAX_V4L_BUFFERS].length ); - capture->bufferIndex = MAX_V4L_BUFFERS; - //printf("got data in buff %d, len=%d, flags=0x%X, seq=%d, used=%d)\n", - // buf.index, buf.length, buf.flags, buf.sequence, buf.bytesused); + //We shouldn't use this buffer in the queue while not retrieve frame from it. + buffers[buf.index].buffer = buf; + bufferIndex = buf.index; //set timestamp in capture struct to be timestamp of most recent frame - capture->timestamp = buf.timestamp; - - if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf)) - perror ("VIDIOC_QBUF"); - - return 1; + timestamp = buf.timestamp; + return true; } -static int mainloop_v4l2(CvCaptureCAM_V4L* capture) { - for (;;) { - fd_set fds; - struct timeval tv; - int r; +bool CvCaptureCAM_V4L::tryIoctl(unsigned long ioctlCode, void *parameter) const +{ + while (-1 == ioctl(deviceHandle, ioctlCode, parameter)) { + if (!(errno == EBUSY || errno == EAGAIN)) + return false; - FD_ZERO (&fds); - FD_SET (capture->deviceHandle, &fds); + fd_set fds; + FD_ZERO(&fds); + FD_SET(deviceHandle, &fds); /* Timeout. 
*/ + struct timeval tv; tv.tv_sec = 10; tv.tv_usec = 0; - r = select (capture->deviceHandle+1, &fds, NULL, NULL, &tv); - - if (-1 == r) { - if (EINTR == errno) - continue; - - perror ("select"); - } - - if (0 == r) { - fprintf (stderr, "select timeout\n"); - - /* end the infinite loop */ - break; + int result = select(deviceHandle + 1, &fds, NULL, NULL, &tv); + if (0 == result) { + fprintf(stderr, "select timeout\n"); + return false; } - - int returnCode = read_frame_v4l2 (capture); - if(returnCode == -1) - return -1; - if(returnCode == 1) - return 1; + if (-1 == result && EINTR != errno) + perror("select"); } - return 0; + return true; } -static bool icvGrabFrameCAM_V4L(CvCaptureCAM_V4L* capture) { - if (capture->FirstCapture) { +bool CvCaptureCAM_V4L::grabFrame() +{ + if (FirstCapture) { /* Some general initialization must take place the first time through */ /* This is just a technicality, but all buffers must be filled up before any staggered SYNC is applied. SO, filler up. (see V4L HowTo) */ + bufferIndex = -1; + for (__u32 index = 0; index < req.count; ++index) { + v4l2_buffer buf = v4l2_buffer(); - { + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + buf.memory = V4L2_MEMORY_MMAP; + buf.index = index; - for (capture->bufferIndex = 0; - capture->bufferIndex < ((int)capture->req.count); - ++capture->bufferIndex) - { - - v4l2_buffer buf = v4l2_buffer(); - - buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - buf.memory = V4L2_MEMORY_MMAP; - buf.index = (unsigned long)capture->bufferIndex; - - if (-1 == ioctl (capture->deviceHandle, VIDIOC_QBUF, &buf)) { - perror ("VIDIOC_QBUF"); - return false; - } - } - - /* enable the streaming */ - capture->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - if (-1 == ioctl (capture->deviceHandle, VIDIOC_STREAMON, - &capture->type)) { - /* error enabling the stream */ - perror ("VIDIOC_STREAMON"); + if (!tryIoctl(VIDIOC_QBUF, &buf)) { + perror("VIDIOC_QBUF"); return false; } } + if(!streaming(true)) { + /* error enabling the stream */ + perror("VIDIOC_STREAMON"); + return false; + } + #if defined(V4L_ABORT_BADJPEG) // skip first frame. it is often bad -- this is unnotied in traditional apps, // but could be fatal if bad jpeg is enabled - if(mainloop_v4l2(capture) != 1) + if (!read_frame_v4l2()) return false; #endif /* preparation is ok */ - capture->FirstCapture = 0; + FirstCapture = false; } - - if(mainloop_v4l2(capture) != 1) return false; - - return true; + // In the case that the grab frame was without retrieveFrame + if (bufferIndex >= 0) { + if (!tryIoctl(VIDIOC_QBUF, &buffers[bufferIndex].buffer)) + perror("VIDIOC_QBUF"); + } + return read_frame_v4l2(); } /* @@ -1020,14 +956,6 @@ move_411_block(int yTL, int yTR, int yBL, int yBR, int u, int v, rgb[5] = LIMIT(r+yBR); } -/* Converts from planar YUV420P to RGB24. */ -static inline void -yuv420p_to_rgb24(int width, int height, uchar* src, uchar* dst, bool isYUV) -{ - cvtColor(Mat(height * 3 / 2, width, CV_8U, src), Mat(height, width, CV_8UC3, dst), - isYUV ? COLOR_YUV2BGR_IYUV : COLOR_YUV2BGR_YV12); -} - // Consider a YUV411P image of 8x2 pixels. // // A plane of Y values as before. 
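The ownership rule behind the new grabFrame()/read_frame_v4l2() pair above: the buffer dequeued with VIDIOC_DQBUF stays out of the driver's queue until the next grabFrame() re-queues it, so retrieveFrame() can expose the mmap'ed data without the old memcpy into the spare buffer. A toy model of that handshake:

#include <queue>

struct DriverQueue { std::queue<int> q; };   // stand-in for the V4L2 buffer queue

int grab(DriverQueue& d, int& heldIndex)
{
    if (heldIndex >= 0)
        d.q.push(heldIndex);   // re-queue the frame the caller consumed (VIDIOC_QBUF)
    heldIndex = d.q.front();   // dequeue the next filled frame (VIDIOC_DQBUF)
    d.q.pop();
    return heldIndex;          // retrieveFrame reads buffers[heldIndex] in place
}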
@@ -1074,47 +1002,6 @@ yuv411p_to_rgb24(int width, int height, } } -/* convert from 4:2:2 YUYV interlaced to RGB24 */ -static void -yuyv_to_rgb24(int width, int height, unsigned char* src, unsigned char* dst) { - cvtColor(Mat(height, width, CV_8UC2, src), Mat(height, width, CV_8UC3, dst), - COLOR_YUV2BGR_YUYV); -} - -static inline void -uyvy_to_rgb24 (int width, int height, unsigned char *src, unsigned char *dst) -{ - cvtColor(Mat(height, width, CV_8UC2, src), Mat(height, width, CV_8UC3, dst), - COLOR_YUV2BGR_UYVY); -} - -static inline void -y16_to_rgb24 (int width, int height, unsigned char* src, unsigned char* dst) -{ - Mat gray8; - Mat(height, width, CV_16UC1, src).convertTo(gray8, CV_8U, 0.00390625); - cvtColor(gray8,Mat(height, width, CV_8UC3, dst),COLOR_GRAY2BGR); -} - -static inline void -y8_to_rgb24 (int width, int height, unsigned char* src, unsigned char* dst) -{ - Mat gray8(height, width, CV_8UC1, src); - cvtColor(gray8,Mat(height, width, CV_8UC3, dst),COLOR_GRAY2BGR); -} - -#ifdef HAVE_JPEG - -/* convert from mjpeg to rgb24 */ -static bool -mjpeg_to_rgb24(int width, int height, unsigned char* src, int length, IplImage* dst) { - Mat temp = cvarrToMat(dst); - imdecode(Mat(1, length, CV_8U, src), IMREAD_COLOR, &temp); - return temp.data && temp.cols == width && temp.rows == height; -} - -#endif - /* * BAYER2RGB24 ROUTINE TAKEN FROM: * @@ -1283,12 +1170,6 @@ static void sgbrg2rgb24(long int WIDTH, long int HEIGHT, unsigned char *src, uns } } -static inline void -rgb24_to_rgb24 (int width, int height, unsigned char *src, unsigned char *dst) -{ - cvtColor(Mat(height, width, CV_8UC3, src), Mat(height, width, CV_8UC3, dst), COLOR_RGB2BGR); -} - #define CLAMP(x) ((x)<0?0:((x)>255)?255:(x)) typedef struct { @@ -1460,458 +1341,578 @@ static int sonix_decompress(int width, int height, unsigned char *inp, unsigned return 0; } -static IplImage* icvRetrieveFrameCAM_V4L( CvCaptureCAM_V4L* capture, int) { - /* Now get what has already been captured as a IplImage return */ - // we need memory iff convert_rgb is true - bool recreate_frame = capture->frame_allocated != capture->convert_rgb; - - if (!capture->convert_rgb) { - // for mjpeg streams the size might change in between, so we have to change the header - recreate_frame += capture->frame.imageSize != (int)capture->buffers[capture->bufferIndex].length; - } - - if(recreate_frame) { - // printf("realloc %d %zu\n", capture->frame.imageSize, capture->buffers[capture->bufferIndex].length); - if(capture->frame_allocated) - cvFree(&capture->frame.imageData); - v4l2_create_frame(capture); - } +void CvCaptureCAM_V4L::convertToRgb(const Buffer ¤tBuffer) +{ + cv::Size imageSize(form.fmt.pix.width, form.fmt.pix.height); + // Not found conversion + switch (palette) + { + case V4L2_PIX_FMT_YUV411P: + yuv411p_to_rgb24(imageSize.width, imageSize.height, + (unsigned char*)(currentBuffer.start), + (unsigned char*)frame.imageData); + return; + case V4L2_PIX_FMT_SBGGR8: + bayer2rgb24(imageSize.width, imageSize.height, + (unsigned char*)currentBuffer.start, + (unsigned char*)frame.imageData); + return; - if(!capture->convert_rgb) { - capture->frame.imageData = (char*)capture->buffers[capture->bufferIndex].start; - return &capture->frame; - } + case V4L2_PIX_FMT_SN9C10X: + sonix_decompress_init(); + sonix_decompress(imageSize.width, imageSize.height, + (unsigned char*)currentBuffer.start, + (unsigned char*)buffers[MAX_V4L_BUFFERS].start); - switch (capture->palette) - { - case V4L2_PIX_FMT_BGR24: - memcpy((char *)capture->frame.imageData, - (char 
*)capture->buffers[capture->bufferIndex].start, - capture->frame.imageSize); + bayer2rgb24(imageSize.width, imageSize.height, + (unsigned char*)buffers[MAX_V4L_BUFFERS].start, + (unsigned char*)frame.imageData); + return; + case V4L2_PIX_FMT_SGBRG8: + sgbrg2rgb24(imageSize.width, imageSize.height, + (unsigned char*)currentBuffer.start, + (unsigned char*)frame.imageData); + return; + default: break; - + } + // Converted by cvtColor or imdecode + cv::Mat destination(imageSize, CV_8UC3, frame.imageData); + switch (palette) { case V4L2_PIX_FMT_YVU420: + cv::cvtColor(cv::Mat(imageSize.height * 3 / 2, imageSize.width, CV_8U, currentBuffer.start), destination, + COLOR_YUV2BGR_YV12); + return; case V4L2_PIX_FMT_YUV420: - yuv420p_to_rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)(capture->buffers[capture->bufferIndex].start), - (unsigned char*)capture->frame.imageData, - capture->palette == V4L2_PIX_FMT_YUV420); - break; - - case V4L2_PIX_FMT_YUV411P: - yuv411p_to_rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)(capture->buffers[capture->bufferIndex].start), - (unsigned char*)capture->frame.imageData); - break; + cv::cvtColor(cv::Mat(imageSize.height * 3 / 2, imageSize.width, CV_8U, currentBuffer.start), destination, + COLOR_YUV2BGR_IYUV); + return; + case V4L2_PIX_FMT_NV12: + cv::cvtColor(cv::Mat(imageSize.height * 3 / 2, imageSize.width, CV_8U, currentBuffer.start), destination, + COLOR_YUV2RGB_NV12); + return; + case V4L2_PIX_FMT_NV21: + cv::cvtColor(cv::Mat(imageSize.height * 3 / 2, imageSize.width, CV_8U, currentBuffer.start), destination, + COLOR_YUV2RGB_NV21); + return; #ifdef HAVE_JPEG case V4L2_PIX_FMT_MJPEG: case V4L2_PIX_FMT_JPEG: - if (!mjpeg_to_rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)(capture->buffers[capture->bufferIndex] - .start), - capture->buffers[capture->bufferIndex].length, - &capture->frame)) - return 0; - break; + cv::imdecode(Mat(1, currentBuffer.buffer.bytesused, CV_8U, currentBuffer.start), IMREAD_COLOR, &destination); + return; #endif - case V4L2_PIX_FMT_YUYV: - yuyv_to_rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)(capture->buffers[capture->bufferIndex].start), - (unsigned char*)capture->frame.imageData); - break; + cv::cvtColor(cv::Mat(imageSize, CV_8UC2, currentBuffer.start), destination, COLOR_YUV2BGR_YUYV); + return; case V4L2_PIX_FMT_UYVY: - uyvy_to_rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)(capture->buffers[capture->bufferIndex].start), - (unsigned char*)capture->frame.imageData); - break; - case V4L2_PIX_FMT_SBGGR8: - bayer2rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)capture->buffers[capture->bufferIndex].start, - (unsigned char*)capture->frame.imageData); - break; - - case V4L2_PIX_FMT_SN9C10X: - sonix_decompress_init(); - sonix_decompress(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)capture->buffers[capture->bufferIndex].start, - (unsigned char*)capture->buffers[(capture->bufferIndex+1) % capture->req.count].start); - - bayer2rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned char*)capture->buffers[(capture->bufferIndex+1) % capture->req.count].start, - (unsigned char*)capture->frame.imageData); - break; - - case V4L2_PIX_FMT_SGBRG8: - sgbrg2rgb24(capture->form.fmt.pix.width, - capture->form.fmt.pix.height, - (unsigned 
-                    (unsigned char*)capture->frame.imageData);
-        break;
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC2, currentBuffer.start), destination, COLOR_YUV2BGR_UYVY);
+        return;
     case V4L2_PIX_FMT_RGB24:
-        rgb24_to_rgb24(capture->form.fmt.pix.width,
-                       capture->form.fmt.pix.height,
-                       (unsigned char*)capture->buffers[(capture->bufferIndex+1) % capture->req.count].start,
-                       (unsigned char*)capture->frame.imageData);
-        break;
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC3, currentBuffer.start), destination, COLOR_RGB2BGR);
+        return;
     case V4L2_PIX_FMT_Y16:
-        if(capture->convert_rgb){
-            y16_to_rgb24(capture->form.fmt.pix.width,
-                         capture->form.fmt.pix.height,
-                         (unsigned char*)capture->buffers[capture->bufferIndex].start,
-                         (unsigned char*)capture->frame.imageData);
-        }else{
-            memcpy((char *)capture->frame.imageData,
-                   (char *)capture->buffers[capture->bufferIndex].start,
-                   capture->frame.imageSize);
-        }
-        break;
+    {
+        cv::Mat temp(imageSize, CV_8UC1, buffers[MAX_V4L_BUFFERS].start);
+        cv::Mat(imageSize, CV_16UC1, currentBuffer.start).convertTo(temp, CV_8U, 1.0 / 256);
+        cv::cvtColor(temp, destination, COLOR_GRAY2BGR);
+        return;
+    }
     case V4L2_PIX_FMT_GREY:
-        if(capture->convert_rgb){
-            y8_to_rgb24(capture->form.fmt.pix.width,
-                        capture->form.fmt.pix.height,
-                        (unsigned char*)capture->buffers[capture->bufferIndex].start,
-                        (unsigned char*)capture->frame.imageData);
-        }else{
-            memcpy((char *)capture->frame.imageData,
-                   (char *)capture->buffers[capture->bufferIndex].start,
-                   capture->frame.imageSize);
-        }
+        cv::cvtColor(cv::Mat(imageSize, CV_8UC1, currentBuffer.start), destination, COLOR_GRAY2BGR);
+        break;
+    case V4L2_PIX_FMT_BGR24:
+    default:
+        memcpy((char *)frame.imageData, (char *)currentBuffer.start,
+               std::min(frame.imageSize, (int)currentBuffer.buffer.bytesused));
         break;
     }
+}
 
-    if (capture->returnFrame)
-        return(&capture->frame);
-    else
-        return 0;
+static inline cv::String capPropertyName(int prop)
+{
+    switch (prop) {
+    case cv::CAP_PROP_POS_MSEC:
+        return "pos_msec";
+    case cv::CAP_PROP_POS_FRAMES:
+        return "pos_frames";
+    case cv::CAP_PROP_POS_AVI_RATIO:
+        return "pos_avi_ratio";
+    case cv::CAP_PROP_FRAME_COUNT:
+        return "frame_count";
+    case cv::CAP_PROP_FRAME_HEIGHT:
+        return "height";
+    case cv::CAP_PROP_FRAME_WIDTH:
+        return "width";
+    case cv::CAP_PROP_CONVERT_RGB:
+        return "convert_rgb";
+    case cv::CAP_PROP_FORMAT:
+        return "format";
+    case cv::CAP_PROP_MODE:
+        return "mode";
+    case cv::CAP_PROP_FOURCC:
+        return "fourcc";
+    case cv::CAP_PROP_AUTO_EXPOSURE:
+        return "auto_exposure";
+    case cv::CAP_PROP_EXPOSURE:
+        return "exposure";
+    case cv::CAP_PROP_TEMPERATURE:
+        return "temperature";
+    case cv::CAP_PROP_FPS:
+        return "fps";
+    case cv::CAP_PROP_BRIGHTNESS:
+        return "brightness";
+    case cv::CAP_PROP_CONTRAST:
+        return "contrast";
+    case cv::CAP_PROP_SATURATION:
+        return "saturation";
+    case cv::CAP_PROP_HUE:
+        return "hue";
+    case cv::CAP_PROP_GAIN:
+        return "gain";
+    case cv::CAP_PROP_RECTIFICATION:
+        return "rectification";
+    case cv::CAP_PROP_MONOCHROME:
+        return "monochrome";
+    case cv::CAP_PROP_SHARPNESS:
+        return "sharpness";
+    case cv::CAP_PROP_GAMMA:
+        return "gamma";
+    case cv::CAP_PROP_TRIGGER:
+        return "trigger";
+    case cv::CAP_PROP_TRIGGER_DELAY:
+        return "trigger_delay";
+    case cv::CAP_PROP_WHITE_BALANCE_RED_V:
+        return "white_balance_red_v";
+    case cv::CAP_PROP_ZOOM:
+        return "zoom";
+    case cv::CAP_PROP_FOCUS:
+        return "focus";
+    case cv::CAP_PROP_GUID:
+        return "guid";
+    case cv::CAP_PROP_ISO_SPEED:
+        return "iso_speed";
"iso_speed"; + case cv::CAP_PROP_BACKLIGHT: + return "backlight"; + case cv::CAP_PROP_PAN: + return "pan"; + case cv::CAP_PROP_TILT: + return "tilt"; + case cv::CAP_PROP_ROLL: + return "roll"; + case cv::CAP_PROP_IRIS: + return "iris"; + case cv::CAP_PROP_SETTINGS: + return "dialog_settings"; + case cv::CAP_PROP_BUFFERSIZE: + return "buffersize"; + case cv::CAP_PROP_AUTOFOCUS: + return "autofocus"; + case cv::CAP_PROP_WHITE_BALANCE_BLUE_U: + return "white_balance_blue_u"; + case cv::CAP_PROP_SAR_NUM: + return "sar_num"; + case cv::CAP_PROP_SAR_DEN: + return "sar_den"; + case CAP_PROP_AUTO_WB: + return "auto wb"; + case CAP_PROP_WB_TEMPERATURE: + return "wb temperature"; + default: + return "unknown"; + } } -static inline __u32 capPropertyToV4L2(int prop) { +static inline int capPropertyToV4L2(int prop) +{ switch (prop) { - case CV_CAP_PROP_BRIGHTNESS: + case cv::CAP_PROP_FPS: + return -1; + case cv::CAP_PROP_FOURCC: + return -1; + case cv::CAP_PROP_FRAME_COUNT: + return V4L2_CID_MPEG_VIDEO_B_FRAMES; + case cv::CAP_PROP_FORMAT: + return -1; + case cv::CAP_PROP_MODE: + return -1; + case cv::CAP_PROP_BRIGHTNESS: return V4L2_CID_BRIGHTNESS; - case CV_CAP_PROP_CONTRAST: + case cv::CAP_PROP_CONTRAST: return V4L2_CID_CONTRAST; - case CV_CAP_PROP_SATURATION: + case cv::CAP_PROP_SATURATION: return V4L2_CID_SATURATION; - case CV_CAP_PROP_HUE: + case cv::CAP_PROP_HUE: return V4L2_CID_HUE; - case CV_CAP_PROP_GAIN: + case cv::CAP_PROP_GAIN: return V4L2_CID_GAIN; - case CV_CAP_PROP_AUTO_EXPOSURE: - return V4L2_CID_EXPOSURE_AUTO; - case CV_CAP_PROP_EXPOSURE: + case cv::CAP_PROP_EXPOSURE: return V4L2_CID_EXPOSURE_ABSOLUTE; - case CV_CAP_PROP_AUTOFOCUS: - return V4L2_CID_FOCUS_AUTO; - case CV_CAP_PROP_FOCUS: + case cv::CAP_PROP_CONVERT_RGB: + return -1; + case cv::CAP_PROP_WHITE_BALANCE_BLUE_U: + return V4L2_CID_BLUE_BALANCE; + case cv::CAP_PROP_RECTIFICATION: + return -1; + case cv::CAP_PROP_MONOCHROME: + return -1; + case cv::CAP_PROP_SHARPNESS: + return V4L2_CID_SHARPNESS; + case cv::CAP_PROP_AUTO_EXPOSURE: + return V4L2_CID_EXPOSURE_AUTO; + case cv::CAP_PROP_GAMMA: + return V4L2_CID_GAMMA; + case cv::CAP_PROP_TEMPERATURE: + return V4L2_CID_WHITE_BALANCE_TEMPERATURE; + case cv::CAP_PROP_TRIGGER: + return -1; + case cv::CAP_PROP_TRIGGER_DELAY: + return -1; + case cv::CAP_PROP_WHITE_BALANCE_RED_V: + return V4L2_CID_RED_BALANCE; + case cv::CAP_PROP_ZOOM: + return V4L2_CID_ZOOM_ABSOLUTE; + case cv::CAP_PROP_FOCUS: return V4L2_CID_FOCUS_ABSOLUTE; - default: + case cv::CAP_PROP_GUID: + return -1; + case cv::CAP_PROP_ISO_SPEED: + return V4L2_CID_ISO_SENSITIVITY; + case cv::CAP_PROP_BACKLIGHT: + return V4L2_CID_BACKLIGHT_COMPENSATION; + case cv::CAP_PROP_PAN: + return V4L2_CID_PAN_ABSOLUTE; + case cv::CAP_PROP_TILT: + return V4L2_CID_TILT_ABSOLUTE; + case cv::CAP_PROP_ROLL: + return V4L2_CID_ROTATE; + case cv::CAP_PROP_IRIS: + return V4L2_CID_IRIS_ABSOLUTE; + case cv::CAP_PROP_SETTINGS: return -1; + case cv::CAP_PROP_BUFFERSIZE: + return -1; + case cv::CAP_PROP_AUTOFOCUS: + return V4L2_CID_FOCUS_AUTO; + case cv::CAP_PROP_SAR_NUM: + return V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_HEIGHT; + case cv::CAP_PROP_SAR_DEN: + return V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_WIDTH; + case CAP_PROP_AUTO_WB: + return V4L2_CID_AUTO_WHITE_BALANCE; + case CAP_PROP_WB_TEMPERATURE: + return V4L2_CID_WHITE_BALANCE_TEMPERATURE; + default: + break; } + return -1; } -static double icvGetPropertyCAM_V4L (const CvCaptureCAM_V4L* capture, - int property_id ) { - { - v4l2_format form; - memset(&form, 0, sizeof(v4l2_format)); - form.type = 
-      if (-1 == ioctl (capture->deviceHandle, VIDIOC_G_FMT, &form)) {
-          /* display an error message, and return an error code */
-          perror ("VIDIOC_G_FMT");
-          return -1;
-      }
-
-      switch (property_id) {
-      case CV_CAP_PROP_FRAME_WIDTH:
-          return form.fmt.pix.width;
-      case CV_CAP_PROP_FRAME_HEIGHT:
-          return form.fmt.pix.height;
-      case CV_CAP_PROP_FOURCC:
-      case CV_CAP_PROP_MODE:
-          return capture->palette;
-      case CV_CAP_PROP_FORMAT:
-          return CV_MAKETYPE(IPL2CV_DEPTH(capture->frame.depth), capture->frame.nChannels);
-      case CV_CAP_PROP_CONVERT_RGB:
-          return capture->convert_rgb;
-      case CV_CAP_PROP_BUFFERSIZE:
-          return capture->bufferSize;
-      }
-
-      if(property_id == CV_CAP_PROP_FPS) {
-          v4l2_streamparm sp = v4l2_streamparm();
-          sp.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-          if (ioctl(capture->deviceHandle, VIDIOC_G_PARM, &sp) < 0){
-              fprintf(stderr, "VIDEOIO ERROR: V4L: Unable to get camera FPS\n");
-              return -1;
-          }
-
-          return sp.parm.capture.timeperframe.denominator / (double)sp.parm.capture.timeperframe.numerator;
-      }
-
-      /* initialize the control structure */
-
-      if(property_id == CV_CAP_PROP_POS_MSEC) {
-          if (capture->FirstCapture) {
-              return 0;
-          } else {
-              return 1000 * capture->timestamp.tv_sec + ((double) capture->timestamp.tv_usec) / 1000;
-          }
-      }
-
-      __u32 v4l2id = capPropertyToV4L2(property_id);
-
-      if(v4l2id == __u32(-1)) {
-          fprintf(stderr,
-                  "VIDEOIO ERROR: V4L2: getting property #%d is not supported\n",
-                  property_id);
-          return -1;
-      }
-
-      v4l2_control control = {v4l2id, 0};
-
-      if (-1 == ioctl (capture->deviceHandle, VIDIOC_G_CTRL,
-                       &control)) {
-
-          fprintf( stderr, "VIDEOIO ERROR: V4L2: ");
-          switch (property_id) {
-          case CV_CAP_PROP_BRIGHTNESS:
-              fprintf (stderr, "Brightness");
-              break;
-          case CV_CAP_PROP_CONTRAST:
-              fprintf (stderr, "Contrast");
-              break;
-          case CV_CAP_PROP_SATURATION:
-              fprintf (stderr, "Saturation");
-              break;
-          case CV_CAP_PROP_HUE:
-              fprintf (stderr, "Hue");
-              break;
-          case CV_CAP_PROP_GAIN:
-              fprintf (stderr, "Gain");
-              break;
-          case CV_CAP_PROP_AUTO_EXPOSURE:
-              fprintf (stderr, "Auto Exposure");
-              break;
-          case CV_CAP_PROP_EXPOSURE:
-              fprintf (stderr, "Exposure");
-              break;
-          case CV_CAP_PROP_AUTOFOCUS:
-              fprintf (stderr, "Autofocus");
-              break;
-          case CV_CAP_PROP_FOCUS:
-              fprintf (stderr, "Focus");
-              break;
-          }
-          fprintf (stderr, " is not supported by your device\n");
-
-          return -1;
-      }
-
-      /* get the min/max values */
-      Range range = capture->getRange(property_id);
-
-      /* all was OK, so convert to 0.0 - 1.0 range, and return the value */
-      return ((double)control.value - range.start) / range.size();
-
-  }
-};
-
-static bool icvSetControl (CvCaptureCAM_V4L* capture,
-                           int property_id, double value) {
-
-    /* limitation of the input value */
-    if (value < 0.0) {
-        value = 0.0;
-    } else if (value > 1.0) {
-        value = 1.0;
+static inline bool compatibleRange(int property_id)
+{
+    switch (property_id) {
+    case cv::CAP_PROP_BRIGHTNESS:
+    case cv::CAP_PROP_CONTRAST:
+    case cv::CAP_PROP_SATURATION:
+    case cv::CAP_PROP_HUE:
+    case cv::CAP_PROP_GAIN:
+    case cv::CAP_PROP_EXPOSURE:
+    case cv::CAP_PROP_FOCUS:
+    case cv::CAP_PROP_AUTOFOCUS:
+    case cv::CAP_PROP_AUTO_EXPOSURE:
+        return true;
+    default:
+        break;
    }
+    return false;
+}
+bool CvCaptureCAM_V4L::controlInfo(int property_id, __u32 &_v4l2id, cv::Range &range) const
+{
     /* initialisations */
-    __u32 v4l2id = capPropertyToV4L2(property_id);
-
-    if(v4l2id == __u32(-1)) {
-        fprintf(stderr,
-                "VIDEOIO ERROR: V4L2: setting property #%d is not supported\n",
-                property_id);
+    int v4l2id = capPropertyToV4L2(property_id);
+    v4l2_queryctrl queryctrl = v4l2_queryctrl();
+    queryctrl.id = __u32(v4l2id);
+    if (v4l2id == -1 || !tryIoctl(VIDIOC_QUERYCTRL, &queryctrl)) {
+        fprintf(stderr, "VIDEOIO ERROR: V4L2: property %s is not supported\n", capPropertyName(property_id).c_str());
         return false;
     }
+    _v4l2id = __u32(v4l2id);
+    range = cv::Range(queryctrl.minimum, queryctrl.maximum);
+    if (normalizePropRange) {
+        switch(property_id)
+        {
+        case CAP_PROP_WB_TEMPERATURE:
+        case CAP_PROP_AUTO_WB:
+        case CAP_PROP_AUTOFOCUS:
+            range = Range(0, 1); // do not convert
+            break;
+        case CAP_PROP_AUTO_EXPOSURE:
+            range = Range(0, 4);
+        default:
+            break;
+        }
+    }
+    return true;
+}
 
-    /* get the min/max values */
-    Range range = capture->getRange(property_id);
-
-    /* scale the value we want to set */
-    value = value * range.size() + range.start;
-
+bool CvCaptureCAM_V4L::icvControl(__u32 v4l2id, int &value, bool isSet) const
+{
     /* set which control we want to set */
-    v4l2_control control = {v4l2id, int(value)};
+    v4l2_control control = v4l2_control();
+    control.id = v4l2id;
+    control.value = value;
 
     /* The driver may clamp the value or return ERANGE, ignored here */
-    if (-1 == ioctl(capture->deviceHandle, VIDIOC_S_CTRL, &control) && errno != ERANGE) {
-        perror ("VIDIOC_S_CTRL");
+    if (!tryIoctl(isSet ? VIDIOC_S_CTRL : VIDIOC_G_CTRL, &control)) {
+        switch (errno) {
+#ifndef NDEBUG
+        case EINVAL:
+            fprintf(stderr,
+                    "The struct v4l2_control id is invalid or the value is inappropriate for the given control (i.e. "
+                    "if a menu item is selected that is not supported by the driver according to VIDIOC_QUERYMENU).");
+            break;
+        case ERANGE:
+            fprintf(stderr, "The struct v4l2_control value is out of bounds.");
+            break;
+        case EACCES:
+            fprintf(stderr, "Attempt to set a read-only control or to get a write-only control.");
+            break;
+#endif
+        default:
+            perror(isSet ? "VIDIOC_S_CTRL" : "VIDIOC_G_CTRL");
"VIDIOC_S_CTRL" : "VIDIOC_G_CTRL"); + break; + } return false; } + if (!isSet) + value = control.value; + return true; +} - if(control.id == V4L2_CID_EXPOSURE_AUTO && control.value == V4L2_EXPOSURE_MANUAL) { - // update the control range for expose after disabling autoexposure - // as it is not read correctly at startup - // TODO check this again as it might be fixed with Linux 4.5 - v4l2_control_range(capture, V4L2_CID_EXPOSURE_ABSOLUTE); +double CvCaptureCAM_V4L::getProperty(int property_id) const +{ + switch (property_id) { + case cv::CAP_PROP_FRAME_WIDTH: + return form.fmt.pix.width; + case cv::CAP_PROP_FRAME_HEIGHT: + return form.fmt.pix.height; + case cv::CAP_PROP_FOURCC: + return palette; + case cv::CAP_PROP_FORMAT: + return CV_MAKETYPE(IPL2CV_DEPTH(frame.depth), frame.nChannels); + case cv::CAP_PROP_MODE: + if (normalizePropRange) + return palette; + return normalizePropRange; + case cv::CAP_PROP_CONVERT_RGB: + return convert_rgb; + case cv::CAP_PROP_BUFFERSIZE: + return bufferSize; + case cv::CAP_PROP_FPS: + { + v4l2_streamparm sp = v4l2_streamparm(); + sp.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + if (!tryIoctl(VIDIOC_G_PARM, &sp)) { + fprintf(stderr, "VIDEOIO ERROR: V4L: Unable to get camera FPS\n"); + return -1; + } + return sp.parm.capture.timeperframe.denominator / (double)sp.parm.capture.timeperframe.numerator; } + case cv::CAP_PROP_POS_MSEC: + if (FirstCapture) + return 0; - /* all was OK */ - return true; + return 1000 * timestamp.tv_sec + ((double)timestamp.tv_usec) / 1000; + case cv::CAP_PROP_CHANNEL: + return channelNumber; + default: + { + cv::Range range; + __u32 v4l2id; + if(!controlInfo(property_id, v4l2id, range)) + return -1.0; + int value = 0; + if(!icvControl(v4l2id, value, false)) + return -1.0; + if (normalizePropRange && compatibleRange(property_id)) + return ((double)value - range.start) / range.size(); + return value; + } + } + return -1.0; } -static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture, - int property_id, double value ){ - bool retval = false; - bool possible; +bool CvCaptureCAM_V4L::icvSetFrameSize(int _width, int _height) +{ + if (_width > 0) + width_set = _width; + + if (height > 0) + height_set = _height; /* two subsequent calls setting WIDTH and HEIGHT will change the video size */ + if (width_set <= 0 || height_set <= 0) + return true; + + width = width_set; + height = height_set; + width_set = height_set = 0; + return v4l2_reset(); +} +bool CvCaptureCAM_V4L::setProperty( int property_id, double _value ) +{ + int value = cvRound(_value); switch (property_id) { - case CV_CAP_PROP_FRAME_WIDTH: - { - int& width = capture->width_set; - int& height = capture->height_set; - width = cvRound(value); - retval = width != 0; - if(width !=0 && height != 0) { - capture->width = width; - capture->height = height; - retval = v4l2_reset(capture); - width = height = 0; + case cv::CAP_PROP_FRAME_WIDTH: + return icvSetFrameSize(value, 0); + case cv::CAP_PROP_FRAME_HEIGHT: + return icvSetFrameSize(0, value); + case cv::CAP_PROP_FPS: + if (fps == static_cast<__u32>(value)) + return true; + return setFps(value); + case cv::CAP_PROP_CONVERT_RGB: + if (bool(value)) { + convert_rgb = convertableToRgb(); + return convert_rgb; } - } - break; - case CV_CAP_PROP_FRAME_HEIGHT: + convert_rgb = false; + return true; + case cv::CAP_PROP_FOURCC: { - int& width = capture->width_set; - int& height = capture->height_set; - height = cvRound(value); - retval = height != 0; - if(width !=0 && height != 0) { - capture->width = width; - capture->height = height; - retval = 
-            width = height = 0;
-        }
+        if (palette == static_cast<__u32>(value))
+            return true;
+
+        __u32 old_palette = palette;
+        palette = static_cast<__u32>(value);
+        if (v4l2_reset())
+            return true;
+
+        palette = old_palette;
+        v4l2_reset();
+        return false;
     }
-        break;
-    case CV_CAP_PROP_FPS:
-        capture->fps = value;
-        retval = v4l2_reset(capture);
-        break;
-    case CV_CAP_PROP_CONVERT_RGB:
-        // returns "0" for formats we do not know how to map to IplImage
-        possible = v4l2_num_channels(capture->palette);
-        capture->convert_rgb = bool(value) && possible;
-        retval = possible || !bool(value);
-        break;
-    case CV_CAP_PROP_FOURCC:
+    case cv::CAP_PROP_MODE:
+        normalizePropRange = bool(value);
+        return true;
+    case cv::CAP_PROP_BUFFERSIZE:
+        if (bufferSize == value)
+            return true;
+
+        if (value > MAX_V4L_BUFFERS || value < 1) {
+            fprintf(stderr, "V4L: Bad buffer size %d, buffer size must be from 1 to %d\n", value, MAX_V4L_BUFFERS);
+            return false;
+        }
+        bufferSize = value;
+        return v4l2_reset();
+    case cv::CAP_PROP_CHANNEL:
     {
-        __u32 old_palette = capture->palette;
-        __u32 new_palette = static_cast<__u32>(value);
-        capture->palette = new_palette;
-        if (v4l2_reset(capture)) {
-            retval = true;
-        } else {
-            capture->palette = old_palette;
-            v4l2_reset(capture);
-            retval = false;
+        if (value < 0) {
+            channelNumber = -1;
+            return true;
         }
+        if (channelNumber == value)
+            return true;
+
+        int old_channel = channelNumber;
+        channelNumber = value;
+        if (v4l2_reset())
+            return true;
+
+        channelNumber = old_channel;
+        v4l2_reset();
+        return false;
     }
-        break;
-    case CV_CAP_PROP_BUFFERSIZE:
-        if ((int)value > MAX_V4L_BUFFERS || (int)value < 1) {
-            fprintf(stderr, "V4L: Bad buffer size %d, buffer size must be from 1 to %d\n", (int)value, MAX_V4L_BUFFERS);
-            retval = false;
-        } else {
-            capture->bufferSize = (int)value;
-            if (capture->bufferIndex > capture->bufferSize) {
-                capture->bufferIndex = 0;
-            }
-            retval = v4l2_reset(capture);
-        }
-        break;
     default:
-        retval = icvSetControl(capture, property_id, value);
-        break;
+    {
+        cv::Range range;
+        __u32 v4l2id;
+        if (!controlInfo(property_id, v4l2id, range))
+            return false;
+        if (normalizePropRange && compatibleRange(property_id))
+            value = cv::saturate_cast<int>(_value * range.size() + range.start);
+        return icvControl(v4l2id, value, true);
+    }
     }
+    return false;
+}
 
-    /* return the the status */
-    return retval;
+void CvCaptureCAM_V4L::releaseFrame()
+{
+    if (frame_allocated && frame.imageData) {
+        cvFree(&frame.imageData);
+        frame_allocated = false;
+    }
 }
 
-static void icvCloseCAM_V4L( CvCaptureCAM_V4L* capture ){
-    /* Deallocate space - Hopefully, no leaks */
+void CvCaptureCAM_V4L::releaseBuffers()
+{
+    releaseFrame();
-    if (!capture->deviceName.empty())
-    {
-        if (capture->deviceHandle != -1)
-        {
-            capture->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-            if (-1 == ioctl(capture->deviceHandle, VIDIOC_STREAMOFF, &capture->type)) {
-                perror ("Unable to stop the stream");
-            }
+    if (buffers[MAX_V4L_BUFFERS].start) {
+        free(buffers[MAX_V4L_BUFFERS].start);
+        buffers[MAX_V4L_BUFFERS].start = 0;
+    }
-            for (unsigned int n_buffers = 0; n_buffers < MAX_V4L_BUFFERS; ++n_buffers)
-            {
-                if (capture->buffers[n_buffers].start) {
-                    if (-1 == munmap (capture->buffers[n_buffers].start, capture->buffers[n_buffers].length)) {
-                        perror ("munmap");
-                    } else {
-                        capture->buffers[n_buffers].start = 0;
-                    }
-                }
-            }
+    bufferIndex = -1;
+    FirstCapture = true;
+    if (!isOpened())
+        return;
-            if (capture->buffers[MAX_V4L_BUFFERS].start)
-            {
-                free(capture->buffers[MAX_V4L_BUFFERS].start);
-                capture->buffers[MAX_V4L_BUFFERS].start = 0;
+    for (unsigned int n_buffers = 0; n_buffers < MAX_V4L_BUFFERS; ++n_buffers) {
+        if (buffers[n_buffers].start) {
+            if (-1 == munmap(buffers[n_buffers].start, buffers[n_buffers].length)) {
+                perror("munmap");
+            } else {
+                buffers[n_buffers].start = 0;
             }
         }
-
-        if (capture->deviceHandle != -1)
-            close(capture->deviceHandle);
-
-        if (capture->frame_allocated && capture->frame.imageData)
-            cvFree(&capture->frame.imageData);
-
-        capture->deviceName.clear(); // flag that the capture is closed
     }
+    // Applications can call ioctl VIDIOC_REQBUFS again to change the number of buffers,
+    // however this cannot succeed when any buffers are still mapped. A count value of zero
+    // frees all buffers, after aborting or finishing any DMA in progress, an implicit VIDIOC_STREAMOFF.
+    requestBuffers(0);
 };
 
-bool CvCaptureCAM_V4L::grabFrame()
+bool CvCaptureCAM_V4L::streaming(bool startStream)
 {
-    return icvGrabFrameCAM_V4L( this );
-}
+    if (!isOpened())
+        return !startStream;
 
-IplImage* CvCaptureCAM_V4L::retrieveFrame(int)
-{
-    return icvRetrieveFrameCAM_V4L( this, 0 );
+    type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    return tryIoctl(startStream ? VIDIOC_STREAMON : VIDIOC_STREAMOFF, &type);
 }
 
-double CvCaptureCAM_V4L::getProperty( int propId ) const
+IplImage *CvCaptureCAM_V4L::retrieveFrame(int)
 {
-    return icvGetPropertyCAM_V4L( this, propId );
-}
+    if (bufferIndex < 0)
+        return &frame;
 
-bool CvCaptureCAM_V4L::setProperty( int propId, double value )
-{
-    return icvSetPropertyCAM_V4L( this, propId, value );
+    /* Now get what has already been captured as a IplImage return */
+    const Buffer &currentBuffer = buffers[bufferIndex];
+    if (convert_rgb) {
+        if (!frame_allocated)
+            v4l2_create_frame();
+
+        convertToRgb(currentBuffer);
+    } else {
+        // for mjpeg streams the size might change in between, so we have to change the header
+        // We didn't allocate memory when not convert_rgb, but we have to recreate the header
+        if (frame.imageSize != (int)currentBuffer.buffer.bytesused)
+            v4l2_create_frame();
+
+        frame.imageData = (char *)buffers[MAX_V4L_BUFFERS].start;
+        memcpy(buffers[MAX_V4L_BUFFERS].start, currentBuffer.start,
+               std::min(buffers[MAX_V4L_BUFFERS].length, (size_t)currentBuffer.buffer.bytesused));
+    }
+    // Re-queue the buffer for the driver
+    if (!tryIoctl(VIDIOC_QBUF, &buffers[bufferIndex].buffer))
+        perror("VIDIOC_QBUF");
+
+    bufferIndex = -1;
+    return &frame;
 }
 
 } // end namespace cv
diff --git a/modules/videoio/test/test_camera.cpp b/modules/videoio/test/test_camera.cpp
index ef66dca8d9..eb6fb60c52 100644
--- a/modules/videoio/test/test_camera.cpp
+++ b/modules/videoio/test/test_camera.cpp
@@ -11,16 +11,8 @@
 namespace opencv_test { namespace {
 
-TEST(DISABLED_VideoIO_Camera, basic)
+static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100)
 {
-    VideoCapture capture(0);
-    ASSERT_TRUE(capture.isOpened());
-    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
-    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
-    std::cout << "      height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
-    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
-
-    const int N = 100;
     Mat frame;
     int64 time0 = cv::getTickCount();
     for (int i = 0; i < N; i++)
@@ -34,7 +26,32 @@ TEST(DISABLED_VideoIO_Camera, basic)
     }
     int64 time1 = cv::getTickCount();
     printf("Processed %d frames on %.2f FPS\n", N, (N * cv::getTickFrequency()) / (time1 - time0 + 1));
+}
+TEST(DISABLED_VideoIO_Camera, basic)
+{
+    VideoCapture capture(0);
+    ASSERT_TRUE(capture.isOpened());
+    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "      height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+    test_readFrames(capture);
+    capture.release();
+}
+
+TEST(DISABLED_VideoIO_Camera, validate_V4L2_MJPEG)
+{
+    VideoCapture capture(CAP_V4L2);
+    ASSERT_TRUE(capture.isOpened());
+    ASSERT_TRUE(capture.set(CAP_PROP_FOURCC, VideoWriter::fourcc('M', 'J', 'P', 'G')));
+    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "      height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+    int fourcc = (int)capture.get(CAP_PROP_FOURCC);
+    std::cout << "FOURCC code: " << cv::format("0x%8x", fourcc) << std::endl;
+    test_readFrames(capture);
     capture.release();
 }
 
@@ -43,27 +60,12 @@ TEST(DISABLED_VideoIO_Camera, dshow_avermedia_capture)
 {
     VideoCapture capture(0);
     ASSERT_TRUE(capture.isOpened());
-    capture.set(CAP_CROSSBAR_INPIN_TYPE, 6);
+    capture.set(CAP_PROP_CHANNEL, 6);
     std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
     std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
     std::cout << "      height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
     std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
-
-    const int N = 100;
-    Mat frame;
-    int64 time0 = cv::getTickCount();
-    for (int i = 0; i < N; i++)
-    {
-        SCOPED_TRACE(cv::format("frame=%d", i));
-
-        capture >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        EXPECT_GT(cvtest::norm(frame, NORM_INF), 0) << "Complete black image has been received";
-    }
-    int64 time1 = cv::getTickCount();
-    printf("Processed %d frames on %.2f FPS\n", N, (N * cv::getTickFrequency()) / (time1 - time0 + 1));
-
+    test_readFrames(capture);
     capture.release();
 }
 
diff --git a/samples/android/face-detection/jni/DetectionBasedTracker_jni.cpp b/samples/android/face-detection/jni/DetectionBasedTracker_jni.cpp
index 7d198dc53e..ef8c26f85b 100644
--- a/samples/android/face-detection/jni/DetectionBasedTracker_jni.cpp
+++ b/samples/android/face-detection/jni/DetectionBasedTracker_jni.cpp
@@ -88,7 +88,7 @@ JNIEXPORT jlong JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker
         //trackingDetector->setMinObjectSize(Size(faceSize, faceSize));
         }
     }
-    catch(cv::Exception& e)
+    catch(const cv::Exception& e)
     {
         LOGD("nativeCreateObject caught cv::Exception: %s", e.what());
         jclass je = jenv->FindClass("org/opencv/core/CvException");
@@ -121,7 +121,7 @@ JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_
             delete (DetectorAgregator*)thiz;
         }
     }
-    catch(cv::Exception& e)
+    catch(const cv::Exception& e)
     {
         LOGD("nativeestroyObject caught cv::Exception: %s", e.what());
         jclass je = jenv->FindClass("org/opencv/core/CvException");
@@ -147,7 +147,7 @@ JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_
     {
         ((DetectorAgregator*)thiz)->tracker->run();
     }
-    catch(cv::Exception& e)
+    catch(const cv::Exception& e)
     {
         LOGD("nativeStart caught cv::Exception: %s", e.what());
         jclass je = jenv->FindClass("org/opencv/core/CvException");
@@ -173,7 +173,7 @@ JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_
     {
         ((DetectorAgregator*)thiz)->tracker->stop();
     }
-    catch(cv::Exception& e)
+    catch(const cv::Exception& e)
     {
         LOGD("nativeStop caught cv::Exception: %s", e.what());
         jclass je = jenv->FindClass("org/opencv/core/CvException");
@@ -203,7 +203,7 @@ JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_
         //((DetectorAgregator*)thiz)->trackingDetector->setMinObjectSize(Size(faceSize, faceSize));
         }
     }
-    catch(cv::Exception& e)
+    catch(const cv::Exception& e)
     {
         LOGD("nativeStop caught cv::Exception: %s", e.what());
         jclass je = jenv->FindClass("org/opencv/core/CvException");
@@ -233,7 +233,7 @@ JNIEXPORT void JNICALL Java_org_opencv_samples_facedetect_DetectionBasedTracker_
         ((DetectorAgregator*)thiz)->tracker->getObjects(RectFaces);
         *((Mat*)faces) = Mat(RectFaces, true);
     }
-    catch(cv::Exception& e)
+    catch(const cv::Exception& e)
     {
         LOGD("nativeCreateObject caught cv::Exception: %s", e.what());
         jclass je = jenv->FindClass("org/opencv/core/CvException");
diff --git a/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp b/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp
index b71dc10182..27b878b3fa 100644
--- a/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp
+++ b/samples/android/tutorial-4-opencl/jni/CLprocessor.cpp
@@ -63,11 +63,11 @@ void dumpCLinfo()
                  i, name.c_str(), (type==CL_DEVICE_TYPE_GPU ? "GPU" : "CPU"), extensions.c_str() );
         }
     }
-    catch(cl::Error& e)
+    catch(const cl::Error& e)
     {
         LOGE( "OpenCL info: error while gathering OpenCL info: %s (%d)", e.what(), e.err() );
     }
-    catch(std::exception& e)
+    catch(const std::exception& e)
     {
         LOGE( "OpenCL info: error while gathering OpenCL info: %s", e.what() );
     }
@@ -130,11 +130,11 @@ extern "C" void initCL()
             LOGE("Can't init OpenCV with OpenCL TAPI");
         haveOpenCL = true;
     }
-    catch(cl::Error& e)
+    catch(const cl::Error& e)
     {
         LOGE("cl::Error: %s (%d)", e.what(), e.err());
     }
-    catch(std::exception& e)
+    catch(const std::exception& e)
     {
         LOGE("std::exception: %s", e.what());
     }
diff --git a/samples/cpp/detect_blob.cpp b/samples/cpp/detect_blob.cpp
index db86a07ee2..a53743f44e 100644
--- a/samples/cpp/detect_blob.cpp
+++ b/samples/cpp/detect_blob.cpp
@@ -192,7 +192,7 @@ int main(int argc, char *argv[])
             imshow("Original", img);
             waitKey();
         }
-        catch (Exception& e)
+        catch (const Exception& e)
         {
             cout << "Feature : " << *itDesc << "\n";
             cout << e.msg << endl;
diff --git a/samples/cpp/detect_mser.cpp b/samples/cpp/detect_mser.cpp
index 7c2833fea4..d42e18b5b0 100644
--- a/samples/cpp/detect_mser.cpp
+++ b/samples/cpp/detect_mser.cpp
@@ -523,7 +523,7 @@ int main(int argc, char *argv[])
             imshow(winName, result);
             imshow("Original", img);
         }
-        catch (Exception& e)
+        catch (const Exception& e)
        {
             cout << "Feature: " << *itDesc << "\n";
             cout << e.msg << endl;
diff --git a/samples/cpp/live_detect_qrcode.cpp b/samples/cpp/live_detect_qrcode.cpp
index 0c938257b2..07101da9fa 100644
--- a/samples/cpp/live_detect_qrcode.cpp
+++ b/samples/cpp/live_detect_qrcode.cpp
@@ -177,7 +177,7 @@ int showImageQRCodeDetect(string in, string out)
     {
         imwrite(out, color_src, compression_params);
     }
-    catch (cv::Exception& ex)
+    catch (const cv::Exception& ex)
     {
         cout << "Exception converting image to PNG format: ";
         cout << ex.what() << '\n';
diff --git a/samples/cpp/matchmethod_orb_akaze_brisk.cpp b/samples/cpp/matchmethod_orb_akaze_brisk.cpp
index 7d39bae443..1eb0ded535 100644
--- a/samples/cpp/matchmethod_orb_akaze_brisk.cpp
+++ b/samples/cpp/matchmethod_orb_akaze_brisk.cpp
@@ -147,15 +147,15 @@ int main(int argc, char *argv[])
             desMethCmp.push_back(cumSumDist2);
             waitKey();
         }
-        catch (Exception& e)
-        {
+        catch (const Exception& e)
+        {
             cout << e.msg << endl;
             cout << "Cumulative distance cannot be computed." << endl;
             desMethCmp.push_back(-1);
-        }
         }
+        }
     }
-    catch (Exception& e)
+    catch (const Exception& e)
     {
         cout << "Feature : " << *itDesc << "\n";
         if (itMatcher != typeAlgoMatch.end())
diff --git a/samples/cpp/pca.cpp b/samples/cpp/pca.cpp
index fb2f585af8..ba42700f18 100644
--- a/samples/cpp/pca.cpp
+++ b/samples/cpp/pca.cpp
@@ -141,7 +141,7 @@ int main(int argc, char** argv)
     // Read in the data. This can fail if not valid
     try {
         read_imgList(imgList, images);
-    } catch (cv::Exception& e) {
+    } catch (const cv::Exception& e) {
         cerr << "Error opening file \"" << imgList << "\". Reason: " << e.msg << endl;
         exit(1);
     }
diff --git a/samples/directx/d3d10_interop.cpp b/samples/directx/d3d10_interop.cpp
index 85d2607081..e8be8fac50 100644
--- a/samples/directx/d3d10_interop.cpp
+++ b/samples/directx/d3d10_interop.cpp
@@ -260,7 +260,7 @@ public:
         }
     } // try
-    catch (cv::Exception& e)
+    catch (const cv::Exception& e)
     {
         std::cerr << "Exception: " << e.what() << std::endl;
         return 10;
diff --git a/samples/directx/d3d11_interop.cpp b/samples/directx/d3d11_interop.cpp
index 42691c0a63..d00f7cffb8 100644
--- a/samples/directx/d3d11_interop.cpp
+++ b/samples/directx/d3d11_interop.cpp
@@ -378,7 +378,7 @@ public:
         }
     } // try
-    catch (cv::Exception& e)
+    catch (const cv::Exception& e)
     {
         std::cerr << "Exception: " << e.what() << std::endl;
         cleanup();
diff --git a/samples/directx/d3d9_interop.cpp b/samples/directx/d3d9_interop.cpp
index 31a1914cf1..4806a4078d 100644
--- a/samples/directx/d3d9_interop.cpp
+++ b/samples/directx/d3d9_interop.cpp
@@ -225,7 +225,7 @@ public:
         }
     } // try
-    catch (cv::Exception& e)
+    catch (const cv::Exception& e)
     {
         std::cerr << "Exception: " << e.what() << std::endl;
         return 10;
diff --git a/samples/directx/d3d9ex_interop.cpp b/samples/directx/d3d9ex_interop.cpp
index ef03bd625a..24258e3702 100644
--- a/samples/directx/d3d9ex_interop.cpp
+++ b/samples/directx/d3d9ex_interop.cpp
@@ -226,7 +226,7 @@ public:
         }
     } // try
-    catch (cv::Exception& e)
+    catch (const cv::Exception& e)
     {
         std::cerr << "Exception: " << e.what() << std::endl;
         return 10;
diff --git a/samples/directx/d3dsample.hpp b/samples/directx/d3dsample.hpp
index b082ff9c92..b78af8ef1b 100644
--- a/samples/directx/d3dsample.hpp
+++ b/samples/directx/d3dsample.hpp
@@ -158,7 +158,7 @@ int d3d_app(int argc, char** argv, std::string& title)
         return app.run();
     }
-    catch (cv::Exception& e)
+    catch (const cv::Exception& e)
     {
         std::cerr << "Exception: " << e.what() << std::endl;
         return 10;
diff --git a/samples/dnn/tf_text_graph_faster_rcnn.py b/samples/dnn/tf_text_graph_faster_rcnn.py
index a6db8dcd4a..13a9c29ec0 100644
--- a/samples/dnn/tf_text_graph_faster_rcnn.py
+++ b/samples/dnn/tf_text_graph_faster_rcnn.py
@@ -32,6 +32,8 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
     width_stride = float(grid_anchor_generator['width_stride'][0])
     height_stride = float(grid_anchor_generator['height_stride'][0])
     features_stride = float(config['feature_extractor'][0]['first_stage_features_stride'][0])
+    first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
+    first_stage_max_proposals = int(config['first_stage_max_proposals'][0])
 
     print('Number of classes: %d' % num_classes)
     print('Scales:            %s' % str(scales))
@@ -47,7 +49,8 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
     removeIdentity(graph_def)
 
     def to_remove(name, op):
-        return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep)
+        return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
+               (name.startswith('CropAndResize') and op != 'CropAndResize')
 
     removeUnusedNodesAndAttrs(to_remove, graph_def)
 
@@ -114,10 +117,10 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
     detectionOut.addAttr('num_classes', 2)
     detectionOut.addAttr('share_location', True)
     detectionOut.addAttr('background_label_id', 0)
-    detectionOut.addAttr('nms_threshold', 0.7)
+    detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
     detectionOut.addAttr('top_k', 6000)
     detectionOut.addAttr('code_type', "CENTER_SIZE")
-    detectionOut.addAttr('keep_top_k', 100)
+    detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
     detectionOut.addAttr('clip', False)
 
     graph_def.node.extend([detectionOut])
@@ -147,9 +150,11 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
                  'SecondStageBoxPredictor/Reshape_1/Reshape', [1, -1], graph_def)
 
     # Replace Flatten subgraph onto a single node.
+    cropAndResizeNodeName = ''
     for i in reversed(range(len(graph_def.node))):
         if graph_def.node[i].op == 'CropAndResize':
             graph_def.node[i].input.insert(1, 'detection_out/clip_by_value')
+            cropAndResizeNodeName = graph_def.node[i].name
 
         if graph_def.node[i].name == 'SecondStageBoxPredictor/Reshape':
             addConstNode('SecondStageBoxPredictor/Reshape/shape2', [1, -1, 4], graph_def)
@@ -159,11 +164,15 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
 
         if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
                                       'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
-                                      'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape']:
+                                      'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
+                                      'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
+                                      'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
+                                      'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
             del graph_def.node[i]
 
     for node in graph_def.node:
-        if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape':
+        if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
+           node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
             node.op = 'Flatten'
             node.input.pop()
 
@@ -171,6 +180,11 @@ def createFasterRCNNGraph(modelPath, configPath, outputPath):
                           'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
             node.addAttr('loc_pred_transposed', True)
 
+        if node.name.startswith('MaxPool2D'):
+            assert(node.op == 'MaxPool')
+            assert(cropAndResizeNodeName)
+            node.input = [cropAndResizeNodeName]
+
 ################################################################################
 ### Postprocessing
 ################################################################################
diff --git a/samples/opencl/opencl-opencv-interop.cpp b/samples/opencl/opencl-opencv-interop.cpp
index d2961af777..816550feaf 100644
--- a/samples/opencl/opencl-opencv-interop.cpp
+++ b/samples/opencl/opencl-opencv-interop.cpp
@@ -676,7 +676,7 @@ int App::initVideoSource()
             throw std::runtime_error(std::string("specify video source"));
     }
-    catch (std::exception e)
+    catch (const std::exception& e)
     {
         cerr << "ERROR: " << e.what() << std::endl;
         return -1;
diff --git a/samples/opengl/opengl_interop.cpp b/samples/opengl/opengl_interop.cpp
index d69f9e2476..7fbf4d6084 100644
--- a/samples/opengl/opengl_interop.cpp
+++ b/samples/opengl/opengl_interop.cpp
@@ -325,7 +325,7 @@ public:
 
     }
-    catch (cv::Exception& e)
+    catch (const cv::Exception& e)
     {
         std::cerr << "Exception: " << e.what() << std::endl;
         return 10;
@@ -520,7 +520,7 @@ int main(int argc, char** argv)
         app.create();
         return app.run();
     }
-    catch (cv::Exception& e)
+    catch (const cv::Exception& e)
     {
         cerr << "Exception: " << e.what() << endl;
         return 10;
diff --git a/samples/python/digits_video.py b/samples/python/digits_video.py
index f669639f94..5cfcb57c04 100755
--- a/samples/python/digits_video.py
+++ b/samples/python/digits_video.py
@@ -86,7 +86,7 @@ def main():
                 frame[y:,x+w:][:SZ, :SZ] = bin_norm[...,np.newaxis]
 
                 sample = preprocess_hog([bin_norm])
-                digit = model.predict(sample)[0]
+                digit = model.predict(sample)[1].ravel()
                 cv.putText(frame, '%d'%digit, (x, y), cv.FONT_HERSHEY_PLAIN, 1.0, (200, 0, 0), thickness = 1)
 
diff --git a/samples/va_intel/va_intel_interop.cpp b/samples/va_intel/va_intel_interop.cpp
index 770e9dbf41..c4cf822f7a 100644
--- a/samples/va_intel/va_intel_interop.cpp
+++ b/samples/va_intel/va_intel_interop.cpp
@@ -256,7 +256,7 @@ int main(int argc, char** argv)
 
         std::cout << "Interop " << (doInterop ? "ON " : "OFF") << ": processing time, msec: " << time << std::endl;
     }
-    catch (std::exception& ex)
+    catch (const std::exception& ex)
     {
         std::cerr << "ERROR: " << ex.what() << std::endl;
     }
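
A minimal usage sketch of the reworked V4L2 property handling (an editorial illustration, not part of the patch): CAP_PROP_FOURCC switches the capture palette and rolls back via v4l2_reset() on failure, CAP_PROP_MODE now toggles normalizePropRange between the normalized [0, 1] scale and raw driver units, and CAP_PROP_BUFFERSIZE accepts values in 1..MAX_V4L_BUFFERS. Opening camera 0 through the V4L2 backend and MJPG support are assumptions about the local hardware.

#include <opencv2/videoio.hpp>
#include <iostream>

int main()
{
    cv::VideoCapture cap(cv::CAP_V4L2); // camera 0 via the V4L2 backend, as in the new test
    if (!cap.isOpened())
        return 1;

    // Request an MJPG palette; setProperty() restores the old palette if the driver rejects it.
    cap.set(cv::CAP_PROP_FOURCC, cv::VideoWriter::fourcc('M', 'J', 'P', 'G'));

    // CAP_PROP_MODE = 0 disables range normalization: compatible controls
    // (brightness, contrast, exposure, ...) are then read in driver units.
    cap.set(cv::CAP_PROP_MODE, 0);
    std::cout << "brightness (driver units): " << cap.get(cv::CAP_PROP_BRIGHTNESS) << std::endl;

    cap.set(cv::CAP_PROP_BUFFERSIZE, 4); // must stay within 1..MAX_V4L_BUFFERS

    cv::Mat frame;
    cap >> frame;
    return frame.empty() ? 2 : 0;
}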
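
A second sketch for the convert_rgb == false path in retrieveFrame() above, which hands back the raw driver bytes staged in the spare buffers[MAX_V4L_BUFFERS] slot instead of a BGR conversion. That the attached camera delivers a palette worth reading raw is an assumption about the hardware; for unconvertible formats set(CAP_PROP_CONVERT_RGB, true) simply reports false.

#include <opencv2/videoio.hpp>

int main()
{
    cv::VideoCapture cap(cv::CAP_V4L2);
    if (!cap.isOpened())
        return 1;

    // Disable the BGR conversion; subsequent reads return the untouched
    // bytes in the negotiated palette (no cvtColor/imdecode applied).
    cap.set(cv::CAP_PROP_CONVERT_RGB, false);

    cv::Mat raw;
    cap >> raw;

    // CAP_PROP_FOURCC reports which palette those bytes use.
    int fourcc = static_cast<int>(cap.get(cv::CAP_PROP_FOURCC));
    (void)fourcc;
    return raw.empty() ? 2 : 0;
}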