diff --git a/CMakeLists.txt b/CMakeLists.txt index 966505aacc..4b0d58e1f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -316,7 +316,7 @@ OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OFF IF CV_GCC ) OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC ) OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) ) -OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF (CV_GCC AND PPC64LE) ) +OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF ((CV_GCC OR CV_CLANG) AND PPC64LE) ) OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CV_GCC AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) ) OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) ) diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index f6171d74c6..72472a6be9 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -46,13 +46,13 @@ /* Cocoa API */ #cmakedefine HAVE_COCOA -/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/ +/* NVIDIA CUDA Basic Linear Algebra Subprograms (BLAS) API*/ #cmakedefine HAVE_CUBLAS -/* NVidia Cuda Runtime API*/ +/* NVIDIA CUDA Runtime API*/ #cmakedefine HAVE_CUDA -/* NVidia Cuda Fast Fourier Transform (FFT) API*/ +/* NVIDIA CUDA Fast Fourier Transform (FFT) API*/ #cmakedefine HAVE_CUFFT /* IEEE1394 capturing support */ @@ -124,10 +124,10 @@ /* Microsoft Media Foundation Capture library */ #cmakedefine HAVE_MSMF -/* NVidia Video Decoding API*/ +/* NVIDIA Video Decoding API*/ #cmakedefine HAVE_NVCUVID -/* NVidia Video Encoding API*/ +/* NVIDIA Video Encoding API*/ #cmakedefine HAVE_NVCUVENC /* OpenCL Support */ diff --git a/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown b/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown index e7ba511750..9452bc38d7 100644 --- a/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown +++ b/doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown @@ -8,7 +8,7 @@ Goal In the @ref tutorial_video_input_psnr_ssim tutorial I already presented the PSNR and SSIM methods for checking the similarity between the two images. And as you could see, the execution process takes quite some time , especially in the case of the SSIM. However, if the performance numbers of an OpenCV -implementation for the CPU do not satisfy you and you happen to have an NVidia CUDA GPU device in +implementation for the CPU do not satisfy you and you happen to have an NVIDIA CUDA GPU device in your system, all is not lost. You may try to port or write your owm algorithm for the video card. This tutorial will give a good grasp on how to approach coding by using the GPU module of OpenCV. As @@ -187,7 +187,7 @@ introduce asynchronous OpenCV GPU calls too with the help of the @ref cv::cuda:: Result and conclusion --------------------- -On an Intel P8700 laptop CPU paired with a low end NVidia GT220M, here are the performance numbers: +On an Intel P8700 laptop CPU paired with a low end NVIDIA GT220M, here are the performance numbers: @code Time of PSNR CPU (averaged for 10 runs): 41.4122 milliseconds. 
With result of: 19.2506 Time of PSNR GPU (averaged for 10 runs): 158.977 milliseconds. With result of: 19.2506 diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 5545030b91..199be099c8 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -754,5 +754,6 @@ CV_EXPORTS_W void setUseIPP_NE(bool flag); #include "opencv2/core/neon_utils.hpp" #include "opencv2/core/vsx_utils.hpp" +#include "opencv2/core/check.hpp" #endif //OPENCV_CORE_BASE_HPP diff --git a/modules/core/include/opencv2/core/check.hpp b/modules/core/include/opencv2/core/check.hpp new file mode 100644 index 0000000000..92d114dfab --- /dev/null +++ b/modules/core/include/opencv2/core/check.hpp @@ -0,0 +1,135 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CORE_CHECK_HPP +#define OPENCV_CORE_CHECK_HPP + +#include + +namespace cv { + +/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or "" */ +CV_EXPORTS const char* depthToString(int depth); + +/** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or "" */ +CV_EXPORTS const String typeToString(int type); + + +//! @cond IGNORED +namespace detail { + +/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or NULL */ +CV_EXPORTS const char* depthToString_(int depth); + +/** Returns string of cv::Mat depth value: CV_8UC3 -> "CV_8UC3" or cv::String() */ +CV_EXPORTS const cv::String typeToString_(int type); + +enum TestOp { + TEST_CUSTOM = 0, + TEST_EQ = 1, + TEST_NE = 2, + TEST_LE = 3, + TEST_LT = 4, + TEST_GE = 5, + TEST_GT = 6, + CV__LAST_TEST_OP +}; + +struct CheckContext { + const char* func; + const char* file; + int line; + enum TestOp testOp; + const char* message; + const char* p1_str; + const char* p2_str; +}; + +#ifndef CV__CHECK_FILENAME +# define CV__CHECK_FILENAME __FILE__ +#endif + +#ifndef CV__CHECK_FUNCTION +# if defined _MSC_VER +# define CV__CHECK_FUNCTION __FUNCSIG__ +# elif defined __GNUC__ +# define CV__CHECK_FUNCTION __PRETTY_FUNCTION__ +# else +# define CV__CHECK_FUNCTION "" +# endif +#endif + +#define CV__CHECK_LOCATION_VARNAME(id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_check_, id), __LINE__) +#define CV__DEFINE_CHECK_CONTEXT(id, message, testOp, p1_str, p2_str) \ + static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \ + { CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, message, p1_str, p2_str } + +CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_auto(const double v1, const double v2, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx); + +CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const 
CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx); + + +#define CV__TEST_EQ(v1, v2) ((v1) == (v2)) +#define CV__TEST_NE(v1, v2) ((v1) != (v2)) +#define CV__TEST_LE(v1, v2) ((v1) <= (v2)) +#define CV__TEST_LT(v1, v2) ((v1) < (v2)) +#define CV__TEST_GE(v1, v2) ((v1) >= (v2)) +#define CV__TEST_GT(v1, v2) ((v1) > (v2)) + +#define CV__CHECK(id, op, type, v1, v2, v1_str, v2_str, msg_str) do { \ + if(CV__TEST_##op((v1), (v2))) ; else { \ + CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_ ## op, v1_str, v2_str); \ + cv::detail::check_failed_ ## type((v1), (v2), CV__CHECK_LOCATION_VARNAME(id)); \ + } \ +} while (0) + +#define CV__CHECK_CUSTOM_TEST(id, type, v, test_expr, v_str, test_expr_str, msg_str) do { \ + if(!!(test_expr)) ; else { \ + CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_CUSTOM, v_str, test_expr_str); \ + cv::detail::check_failed_ ## type((v), CV__CHECK_LOCATION_VARNAME(id)); \ + } \ +} while (0) + +} // namespace +//! @endcond + + +/// Supported values of these types: int, float, double +#define CV_CheckEQ(v1, v2, msg) CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg) +#define CV_CheckNE(v1, v2, msg) CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg) +#define CV_CheckLE(v1, v2, msg) CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg) +#define CV_CheckLT(v1, v2, msg) CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg) +#define CV_CheckGE(v1, v2, msg) CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg) +#define CV_CheckGT(v1, v2, msg) CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg) + +/// Check with additional "decoding" of type values in error message +#define CV_CheckTypeEQ(t1, t2, msg) CV__CHECK(_, EQ, MatType, t1, t2, #t1, #t2, msg) +/// Check with additional "decoding" of depth values in error message +#define CV_CheckDepthEQ(d1, d2, msg) CV__CHECK(_, EQ, MatDepth, d1, d2, #d1, #d2, msg) + +#define CV_CheckChannelsEQ(c1, c2, msg) CV__CHECK(_, EQ, MatChannels, c1, c2, #c1, #c2, msg) + + +/// Example: type == CV_8UC1 || type == CV_8UC3 +#define CV_CheckType(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatType, t, (test_expr), #t, #test_expr, msg) + +/// Example: depth == CV_32F || depth == CV_64F +#define CV_CheckDepth(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatDepth, t, (test_expr), #t, #test_expr, msg) + +/// Some complex conditions: CV_Check(src2, src2.empty() || (src2.type() == src1.type() && src2.size() == src1.size()), "src2 should have same size/type as src1") +// TODO define pretty-printers: #define CV_Check(v, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg) + +} // namespace + +#endif // OPENCV_CORE_CHECK_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index e7ea899b7e..5518eace9b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Multiply and add Returns \f$ a*b + c \f$ -For floating point types only. */ +For floating point types and signed 32bit int only. 
*/ template inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) @@ -828,6 +828,29 @@ template inline v_reg::w_type, n return c; } +/** @brief Dot product of elements + +Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs. +Scheme: +@code + {A1 A2 ...} // 16-bit +x {B1 B2 ...} // 16-bit +------------- + {A1B1+A2B2+C1 ...} // 32-bit + +@endcode +Implemented only for 16-bit signed source type (v_int16x8). +*/ +template inline v_reg::w_type, n/2> + v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg::w_type, n / 2>& c) +{ + typedef typename V_TypeTraits<_Tp>::w_type w_type; + v_reg s; + for( int i = 0; i < (n/2); i++ ) + s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i]; + return s; +} + /** @brief Multiply and expand Multiply values two registers and store results in two registers with wider pack type. diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index c3c49c902b..033cf0f2dc 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) return v_int32x4(vaddq_s32(cd.val[0], cd.val[1])); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + v_int32x4 s = v_dotprod(a, b); + return v_int32x4(vaddq_s32(s.val , c.val)); +} + #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ @@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_ return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); } +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_int32x4(vmlaq_s32(c.val, a.val, b.val)); +} + #if CV_SIMD128_64F inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) { @@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32) OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64) #endif +#if CV_SIMD128_64F +inline v_int32x4 v_round(const v_float32x4& a) +{ + float32x4_t a_ = a.val; + int32x4_t result; + __asm__ ("fcvtns %0.4s, %1.4s" + : "=w"(result) + : "w"(a_) + : /* No clobbers */); + return v_int32x4(result); +} +#else inline v_int32x4 v_round(const v_float32x4& a) { static const int32x4_t v_sign = vdupq_n_s32(1 << 31), @@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a) int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); } - +#endif inline v_int32x4 v_floor(const v_float32x4& a) { int32x4_t a1 = vcvtq_s32_f32(a.val); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index 9195d7baeb..62ffa3ec88 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) return v_int32x4(_mm_madd_epi16(a.val, b.val)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val)); +} + #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ 
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ @@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) __m128i m = _mm_cmpgt_epi32(b.val, a.val); return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m)); } +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return a * b + c; +} #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ @@ -1632,7 +1641,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& c = v_reinterpret_as_f64(t2); } -// 2-channel, float only +// 2-channel inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) { const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); @@ -1644,7 +1653,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3 } -inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b ) +inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b) +{ + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3 + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7 + + __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5 + __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7 + __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6 + __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7 + + a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7 + b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7 +} + +inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b) +{ + v_int16x8 sa, sb; + v_load_deinterleave((const short*)ptr, sa, sb); + a = v_reinterpret_as_u16(sa); + b = v_reinterpret_as_u16(sb); +} + +inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b) { __m128i t0, t1; t0 = _mm_unpacklo_epi16(a.val, b.val); diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 9f050f7c21..85cef469c3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -1,46 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2015, Itseez Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. 
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html #ifndef OPENCV_HAL_VSX_HPP #define OPENCV_HAL_VSX_HPP @@ -276,34 +236,38 @@ OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2) OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4) OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2) -#define OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(_Tpvec, _Tp, ld_func, st_func) \ +#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a) \ inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec(ld_func(0, ptr)); } \ -inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(ld_func(0, ptr)); } \ +{ return _Tpvec(ld(0, ptr)); } \ +inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr)) \ +{ return _Tpvec(ld_a(0, ptr)); } \ inline _Tpvec v_load_low(const _Tp* ptr) \ { return _Tpvec(vec_ld_l8(ptr)); } \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \ inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ st_func(a.val, 0, ptr); } \ -inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ st_func(a.val, 0, ptr); } \ +{ st(a.val, 0, ptr); } \ +inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \ +{ st_a(a.val, 0, ptr); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ { vec_st_l8(a.val, ptr); } \ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { vec_st_h8(a.val, ptr); } -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint8x16, uchar, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int8x16, schar, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint16x8, ushort, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int16x8, short, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint32x4, uint, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int32x4, int, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float32x4, float, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float64x2, double, vsx_ld, vsx_st) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint64x2, uint64, vsx_ld2, vsx_st2) -OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int64x2, int64, vsx_ld2, vsx_st2) +#define 
OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \ +OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st) + +OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar) +OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar) +OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort) +OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short) +OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint) +OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int) +OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float) + +OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st) +OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2) +OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2) //////////////// Value reordering /////////////// @@ -343,7 +307,7 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ b1.val = fl(a.val); \ } \ inline _Tpwvec v_load_expand(const _Tp* ptr) \ -{ return _Tpwvec(fh(vsx_ld(0, ptr))); } +{ return _Tpwvec(fh(vec_ld_l8(ptr))); } OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu) OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh) @@ -353,10 +317,10 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpac OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh) inline v_uint32x4 v_load_expand_q(const uchar* ptr) -{ return v_uint32x4(vec_ld_buw(ptr)); } +{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); } inline v_int32x4 v_load_expand_q(const schar* ptr) -{ return v_int32x4(vec_ld_bsw(ptr)); } +{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); } /* pack */ #define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \ @@ -429,36 +393,6 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) d.val = vec_mergesql(a.val, b.val); } -/* Extract */ -template -inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) -{ - const int w = sizeof(typename _Tpvec::lane_type); - const int n = _Tpvec::nlanes; - const unsigned int sf = ((w * n) - (s * w)); - if (s == 0) - return _Tpvec(a.val); - else if (sf > 15) - return _Tpvec(); - // bitwise it just to make xlc happy - return _Tpvec(vec_sld(b.val, a.val, sf & 15)); -} - -#define OPENCV_HAL_IMPL_VSX_EXTRACT_2(_Tpvec) \ -template \ -inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ -{ \ - switch(s) { \ - case 0: return _Tpvec(a.val); \ - case 2: return _Tpvec(b.val); \ - case 1: return _Tpvec(vec_sldw(b.val, a.val, 2)); \ - default: return _Tpvec(); \ - } \ -} -OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_uint64x2) -OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_int64x2) - - ////////// Arithmetic, bitwise and comparison operations ///////// /* Element-wise binary and unary operations */ @@ -669,6 +603,11 @@ OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b) OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a) OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a) +/* Extract */ +template +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) +{ return v_rotate_right(a, b); } + ////////// Reduce and mask ///////// /** Reduce **/ @@ -821,6 +760,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ return a * b + c; } + // TODO: exp, log, sin, cos /** Absolute values **/ @@ -904,6 +846,9 @@ inline 
v_float64x2 v_cvt_f64_high(const v_float32x4& a) inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(vec_msum(a.val, b.val, c.val)); } + inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp index c377551364..21b50e8611 100644 --- a/modules/core/include/opencv2/core/vsx_utils.hpp +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -1,46 +1,6 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2015, Itseez Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html #ifndef OPENCV_HAL_VSX_UTILS_HPP #define OPENCV_HAL_VSX_UTILS_HPP @@ -64,106 +24,77 @@ typedef __vector unsigned char vec_uchar16; #define vec_uchar16_set(...) 
(vec_uchar16){__VA_ARGS__} #define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, c)) #define vec_uchar16_c(v) ((vec_uchar16)(v)) -#define vec_uchar16_mx vec_uchar16_sp(0xFF) -#define vec_uchar16_mn vec_uchar16_sp(0) -#define vec_uchar16_z vec_uchar16_mn +#define vec_uchar16_z vec_uchar16_sp(0) typedef __vector signed char vec_char16; #define vec_char16_set(...) (vec_char16){__VA_ARGS__} #define vec_char16_sp(c) (__VSX_S16__(vec_char16, c)) #define vec_char16_c(v) ((vec_char16)(v)) -#define vec_char16_mx vec_char16_sp(0x7F) -#define vec_char16_mn vec_char16_sp(-0x7F-1) #define vec_char16_z vec_char16_sp(0) typedef __vector unsigned short vec_ushort8; #define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__} #define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, c)) #define vec_ushort8_c(v) ((vec_ushort8)(v)) -#define vec_ushort8_mx vec_ushort8_sp(0xFFFF) -#define vec_ushort8_mn vec_ushort8_sp(0) -#define vec_ushort8_z vec_ushort8_mn +#define vec_ushort8_z vec_ushort8_sp(0) typedef __vector signed short vec_short8; #define vec_short8_set(...) (vec_short8){__VA_ARGS__} #define vec_short8_sp(c) (__VSX_S8__(vec_short8, c)) #define vec_short8_c(v) ((vec_short8)(v)) -#define vec_short8_mx vec_short8_sp(0x7FFF) -#define vec_short8_mn vec_short8_sp(-0x7FFF-1) #define vec_short8_z vec_short8_sp(0) typedef __vector unsigned int vec_uint4; #define vec_uint4_set(...) (vec_uint4){__VA_ARGS__} #define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, c)) #define vec_uint4_c(v) ((vec_uint4)(v)) -#define vec_uint4_mx vec_uint4_sp(0xFFFFFFFFU) -#define vec_uint4_mn vec_uint4_sp(0) -#define vec_uint4_z vec_uint4_mn +#define vec_uint4_z vec_uint4_sp(0) typedef __vector signed int vec_int4; #define vec_int4_set(...) (vec_int4){__VA_ARGS__} #define vec_int4_sp(c) (__VSX_S4__(vec_int4, c)) #define vec_int4_c(v) ((vec_int4)(v)) -#define vec_int4_mx vec_int4_sp(0x7FFFFFFF) -#define vec_int4_mn vec_int4_sp(-0x7FFFFFFF-1) #define vec_int4_z vec_int4_sp(0) typedef __vector float vec_float4; #define vec_float4_set(...) (vec_float4){__VA_ARGS__} #define vec_float4_sp(c) (__VSX_S4__(vec_float4, c)) #define vec_float4_c(v) ((vec_float4)(v)) -#define vec_float4_mx vec_float4_sp(3.40282347E+38F) -#define vec_float4_mn vec_float4_sp(1.17549435E-38F) #define vec_float4_z vec_float4_sp(0) typedef __vector unsigned long long vec_udword2; #define vec_udword2_set(...) (vec_udword2){__VA_ARGS__} #define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, c)) #define vec_udword2_c(v) ((vec_udword2)(v)) -#define vec_udword2_mx vec_udword2_sp(18446744073709551615ULL) -#define vec_udword2_mn vec_udword2_sp(0) -#define vec_udword2_z vec_udword2_mn +#define vec_udword2_z vec_udword2_sp(0) typedef __vector signed long long vec_dword2; #define vec_dword2_set(...) (vec_dword2){__VA_ARGS__} #define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, c)) #define vec_dword2_c(v) ((vec_dword2)(v)) -#define vec_dword2_mx vec_dword2_sp(9223372036854775807LL) -#define vec_dword2_mn vec_dword2_sp(-9223372036854775807LL-1) #define vec_dword2_z vec_dword2_sp(0) typedef __vector double vec_double2; #define vec_double2_set(...) (vec_double2){__VA_ARGS__} #define vec_double2_c(v) ((vec_double2)(v)) #define vec_double2_sp(c) (__VSX_S2__(vec_double2, c)) -#define vec_double2_mx vec_double2_sp(1.7976931348623157E+308) -#define vec_double2_mn vec_double2_sp(2.2250738585072014E-308) #define vec_double2_z vec_double2_sp(0) #define vec_bchar16 __vector __bool char #define vec_bchar16_set(...) 
(vec_bchar16){__VA_ARGS__} #define vec_bchar16_c(v) ((vec_bchar16)(v)) -#define vec_bchar16_f (__VSX_S16__(vec_bchar16, 0)) -#define vec_bchar16_t (__VSX_S16__(vec_bchar16, 1)) #define vec_bshort8 __vector __bool short #define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__} #define vec_bshort8_c(v) ((vec_bshort8)(v)) -#define vec_bshort8_f (__VSX_S8__(vec_bshort8, 0)) -#define vec_bshort8_t (__VSX_S8__(vec_bshort8, 1)) #define vec_bint4 __vector __bool int #define vec_bint4_set(...) (vec_bint4){__VA_ARGS__} #define vec_bint4_c(v) ((vec_bint4)(v)) -#define vec_bint4_f (__VSX_S4__(vec_bint4, 0)) -#define vec_bint4_t (__VSX_S4__(vec_bint4, 1)) #define vec_bdword2 __vector __bool long long #define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__} #define vec_bdword2_c(v) ((vec_bdword2)(v)) -#define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0)) -#define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1)) - #define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline)) @@ -688,34 +619,17 @@ VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu) { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); } #endif -// load 4 unsigned bytes into uint4 vector -#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3]) - -// load 4 signed bytes into int4 vector -#define vec_ld_bsw(p) vec_int4_set((p)[0], (p)[1], (p)[2], (p)[3]) - -// load 4 unsigned bytes into float vector -#define vec_ld_bps(p) vec_ctf(vec_ld_buw(p), 0) - // Store lower 8 byte #define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0) // Store higher 8 byte #define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1) -/* - * vec_ld_l8(ptr) -> Load 64-bits of integer data to lower part - * vec_ldz_l8(ptr) -> Load 64-bits of integer data to lower part and zero upper part -**/ -#define VSX_IMPL_LOAD_L8(Tvec, Tp) \ -VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \ -{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } \ -VSX_FINLINE(Tvec) vec_ldz_l8(const Tp *p) \ -{ \ - /* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */ \ - static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \ - return vec_and(vec_ld_l8(p), (Tvec)mask); \ -} +// Load 64-bits of integer data to lower part +#define VSX_IMPL_LOAD_L8(Tvec, Tp) \ +VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \ +{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } + VSX_IMPL_LOAD_L8(vec_uchar16, uchar) VSX_IMPL_LOAD_L8(vec_char16, schar) VSX_IMPL_LOAD_L8(vec_ushort8, ushort) @@ -745,11 +659,11 @@ VSX_IMPL_LOAD_L8(vec_double2, double) * Implement vec_unpacklu and vec_unpackhu * since vec_unpackl, vec_unpackh only support signed integers **/ -#define VSX_IMPL_UNPACKU(rt, rg, zero) \ -VSX_FINLINE(rt) vec_unpacklu(const rg& a) \ -{ return reinterpret_cast(vec_mergel(a, zero)); } \ -VSX_FINLINE(rt) vec_unpackhu(const rg& a) \ -{ return reinterpret_cast(vec_mergeh(a, zero)); } +#define VSX_IMPL_UNPACKU(rt, rg, zero) \ +VSX_FINLINE(rt) vec_unpacklu(const rg& a) \ +{ return (rt)(vec_mergel(a, zero)); } \ +VSX_FINLINE(rt) vec_unpackhu(const rg& a) \ +{ return (rt)(vec_mergeh(a, zero)); } VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z) VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z) diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp new file mode 100644 index 0000000000..966d2cad20 --- /dev/null +++ b/modules/core/src/check.cpp @@ -0,0 +1,160 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" + +#include "opencv2/core/check.hpp" + +namespace cv { + +const char* depthToString(int depth) +{ + const char* s = detail::depthToString_(depth); + return s ? s : ""; +} + +const cv::String typeToString(int type) +{ + cv::String s = detail::typeToString_(type); + if (s.empty()) + { + static cv::String invalidType(""); + return invalidType; + } + return s; +} + + +namespace detail { + +static const char* getTestOpPhraseStr(unsigned testOp) +{ + static const char* _names[] = { "{custom check}", "equal to", "not equal to", "less than or equal to", "less than", "greater than or equal to", "greater than" }; + CV_DbgAssert(testOp < CV__LAST_TEST_OP); + return testOp < CV__LAST_TEST_OP ? _names[testOp] : "???"; +} +static const char* getTestOpMath(unsigned testOp) +{ + static const char* _names[] = { "???", "==", "!=", "<=", "<", ">=", ">" }; + CV_DbgAssert(testOp < CV__LAST_TEST_OP); + return testOp < CV__LAST_TEST_OP ? _names[testOp] : "???"; +} + +const char* depthToString_(int depth) +{ + static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" }; + return depth <= CV_USRTYPE1 ? depthNames[depth] : NULL; +} + +const cv::String typeToString_(int type) +{ + int depth = CV_MAT_DEPTH(type); + int cn = CV_MAT_CN(type); + if (depth >= 0 && depth <= CV_USRTYPE1) + return cv::format("%sC%d", depthToString_(depth), cn); + return cv::String(); +} + +template static CV_NORETURN +void check_failed_auto_(const T& v1, const T& v2, const CheckContext& ctx) +{ + std::stringstream ss; + ss << ctx.message << " (expected: '" << ctx.p1_str << " " << getTestOpMath(ctx.testOp) << " " << ctx.p2_str << "'), where" << std::endl + << " '" << ctx.p1_str << "' is " << v1 << std::endl; + if (ctx.testOp != TEST_CUSTOM && ctx.testOp < CV__LAST_TEST_OP) + { + ss << "must be " << getTestOpPhraseStr(ctx.testOp) << std::endl; + } + ss << " '" << ctx.p2_str << "' is " << v2; + cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line); +} +void check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx) +{ + std::stringstream ss; + ss << ctx.message << " (expected: '" << ctx.p1_str << " " << getTestOpMath(ctx.testOp) << " " << ctx.p2_str << "'), where" << std::endl + << " '" << ctx.p1_str << "' is " << v1 << " (" << depthToString(v1) << ")" << std::endl; + if (ctx.testOp != TEST_CUSTOM && ctx.testOp < CV__LAST_TEST_OP) + { + ss << "must be " << getTestOpPhraseStr(ctx.testOp) << std::endl; + } + ss << " '" << ctx.p2_str << "' is " << v2 << " (" << depthToString(v2) << ")"; + cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line); +} +void check_failed_MatType(const int v1, const int v2, const CheckContext& ctx) +{ + std::stringstream ss; + ss << ctx.message << " (expected: '" << ctx.p1_str << " " << getTestOpMath(ctx.testOp) << " " << ctx.p2_str << "'), where" << std::endl + << " '" << ctx.p1_str << "' is " << v1 << " (" << typeToString(v1) << ")" << std::endl; + if (ctx.testOp != TEST_CUSTOM && ctx.testOp < CV__LAST_TEST_OP) + { + ss << "must be " << getTestOpPhraseStr(ctx.testOp) << std::endl; + } + ss << " '" << ctx.p2_str << "' is " << v2 << " (" << typeToString(v2) << ")"; + cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line); +} +void check_failed_MatChannels(const int v1, const int v2, const 
CheckContext& ctx) +{ + check_failed_auto_(v1, v2, ctx); +} +void check_failed_auto(const int v1, const int v2, const CheckContext& ctx) +{ + check_failed_auto_(v1, v2, ctx); +} +void check_failed_auto(const float v1, const float v2, const CheckContext& ctx) +{ + check_failed_auto_(v1, v2, ctx); +} +void check_failed_auto(const double v1, const double v2, const CheckContext& ctx) +{ + check_failed_auto_(v1, v2, ctx); +} + + +template static CV_NORETURN +void check_failed_auto_(const T& v, const CheckContext& ctx) +{ + std::stringstream ss; + ss << ctx.message << ":" << std::endl + << " '" << ctx.p2_str << "'" << std::endl + << "where" << std::endl + << " '" << ctx.p1_str << "' is " << v; + cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line); +} +void check_failed_MatDepth(const int v, const CheckContext& ctx) +{ + std::stringstream ss; + ss << ctx.message << ":" << std::endl + << " '" << ctx.p2_str << "'" << std::endl + << "where" << std::endl + << " '" << ctx.p1_str << "' is " << v << " (" << depthToString(v) << ")"; + cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line); +} +void check_failed_MatType(const int v, const CheckContext& ctx) +{ + std::stringstream ss; + ss << ctx.message << ":" << std::endl + << " '" << ctx.p2_str << "'" << std::endl + << "where" << std::endl + << " '" << ctx.p1_str << "' is " << v << " (" << typeToString(v) << ")"; + cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line); +} +void check_failed_MatChannels(const int v, const CheckContext& ctx) +{ + check_failed_auto_(v, ctx); +} +void check_failed_auto(const int v, const CheckContext& ctx) +{ + check_failed_auto_(v, ctx); +} +void check_failed_auto(const float v, const CheckContext& ctx) +{ + check_failed_auto_(v, ctx); +} +void check_failed_auto(const double v, const CheckContext& ctx) +{ + check_failed_auto_(v, ctx); +} + + +}} // namespace diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index cc659302e7..86ec6afacd 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -4275,7 +4275,8 @@ public: entry.capacity_ = alignSize(size, (int)_allocationGranularity(size)); Context& ctx = Context::getDefault(); cl_int retval = CL_SUCCESS; - CV_OCL_CHECK_(entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval), retval); + entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval); + CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str()); CV_Assert(entry.clBuffer_ != NULL); if(retval == CL_SUCCESS) { @@ -4669,15 +4670,19 @@ public: { handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags, u->size, u->origdata, &retval); + CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p", + (long long int)u->size, u->origdata, (void*)handle).c_str()); } if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST)) { handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, u->size, u->origdata, &retval); + CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p", + (long long int)u->size, u->origdata, (void*)handle).c_str()); tempUMatFlags |= UMatData::TEMP_COPIED_UMAT; } } - CV_OCL_DBG_CHECK_RESULT(retval, "clCreateBuffer()"); + 
CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str()); if(!handle || retval != CL_SUCCESS) return false; u->handle = handle; @@ -4805,13 +4810,14 @@ public: void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE, (CL_MAP_READ | CL_MAP_WRITE), 0, u->size, 0, 0, 0, &retval); - CV_OCL_CHECK_RESULT(retval, "clEnqueueMapBuffer()"); + CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str()); CV_Assert(u->origdata == data); if (u->originalUMatData) { CV_Assert(u->originalUMatData->data == data); } - CV_OCL_CHECK(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0)); + retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0); + CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str()); CV_OCL_DBG_CHECK(clFinish(q)); } } @@ -4838,7 +4844,8 @@ public: else #endif { - CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle)); + cl_int retval = clReleaseMemObject((cl_mem)u->handle); + CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str()); } u->handle = 0; u->markDeviceCopyObsolete(true); @@ -4955,7 +4962,7 @@ public: u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE, (CL_MAP_READ | CL_MAP_WRITE), 0, u->size, 0, 0, 0, &retval); - CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(sz=%lld)", (int64)u->size).c_str()); + CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str()); } if (u->data && retval == CL_SUCCESS) { @@ -4982,8 +4989,10 @@ public: #ifdef HAVE_OPENCL_SVM CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0); #endif - CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, - 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)); + cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, + 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0); + CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)", + (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str()); u->markHostCopyObsolete(false); } } @@ -5032,7 +5041,8 @@ public: if (u->refcount == 0) { CV_Assert(u->mapcount-- == 1); - CV_OCL_CHECK(retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0)); + retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0); + CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str()); if (Device::getDefault().isAMD()) { // required for multithreaded applications (see stitching test) @@ -5050,8 +5060,10 @@ public: #ifdef HAVE_OPENCL_SVM CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0); #endif - CV_OCL_CHECK(retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, - 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)); + retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, + 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0); + CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)", + (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str()); u->markDeviceCopyObsolete(false); u->markHostCopyObsolete(true); } @@ -5354,8 +5366,10 @@ public: if( iscontinuous ) { 
AlignedDataPtr alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT); - CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, - dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0)); + cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, + dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0); + CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)", + (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str()); } else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS) { @@ -5527,8 +5541,10 @@ public: { if( iscontinuous ) { - CV_OCL_CHECK(retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle, - srcrawofs, dstrawofs, total, 0, 0, 0)); + retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle, + srcrawofs, dstrawofs, total, 0, 0, 0); + CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)", + (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str()); } else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS) { @@ -6373,7 +6389,9 @@ struct Image2D::Impl if (!alias && !src.isContinuous()) { devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err); - CV_OCL_CHECK_RESULT(err, "clCreateBuffer()"); + CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p", + (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData + ).c_str()); const size_t roi[3] = {static_cast(src.cols) * src.elemSize(), static_cast(src.rows), 1}; CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin, diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index 574622a267..9f235e2f52 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -249,10 +249,34 @@ const char* Exception::what() const throw() { return msg.c_str(); } void Exception::formatMessage() { - if( func.size() > 0 ) - msg = format("OpenCV(%s) %s:%d: error: (%d) %s: %s in function %s\n", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str(), func.c_str()); + size_t pos = err.find('\n'); + bool multiline = pos != cv::String::npos; + if (multiline) + { + std::stringstream ss; + size_t prev_pos = 0; + while (pos != cv::String::npos) + { + ss << "> " << err.substr(prev_pos, pos - prev_pos) << std::endl; + prev_pos = pos + 1; + pos = err.find('\n', prev_pos); + } + ss << "> " << err.substr(prev_pos); + if (err[err.size() - 1] != '\n') + ss << std::endl; + err = ss.str(); + } + if (func.size() > 0) + { + if (multiline) + msg = format("OpenCV(%s) %s:%d: error: (%d:%s) in function '%s'\n%s", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), func.c_str(), err.c_str()); + else + msg = format("OpenCV(%s) %s:%d: error: (%d:%s) %s in function '%s'\n", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str(), func.c_str()); + } else - msg = format("OpenCV(%s) %s:%d: error: (%d) %s: %s\n", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str()); + { + msg = format("OpenCV(%s) %s:%d: error: (%d:%s) %s%s", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str(), multiline ? 
"" : "\n"); + } } static const char* g_hwFeatureNames[CV_HARDWARE_MAX_FEATURE] = { NULL }; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index cd9373ad6c..43d8aaff4d 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -521,15 +521,25 @@ template struct TheTest TheTest & test_dot_prod() { typedef typename V_RegTrait128::w_reg Rx2; + typedef typename Rx2::lane_type w_type; + Data dataA, dataB(2); R a = dataA, b = dataB; - Data res = v_dotprod(a, b); + Data dataC; + dataC += std::numeric_limits::is_signed ? + std::numeric_limits::min() : + std::numeric_limits::max() - R::nlanes * (dataB[0] + 1); + Rx2 c = dataC; + + Data resD = v_dotprod(a, b), + resE = v_dotprod(a, b, c); const int n = R::nlanes / 2; for (int i = 0; i < n; ++i) { - EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]); + EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]); + EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]); } return *this; } diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp index ae71d8d149..b98ec4ed34 100644 --- a/modules/core/test/test_misc.cpp +++ b/modules/core/test/test_misc.cpp @@ -245,4 +245,503 @@ TEST(Core_Version, consistency) EXPECT_EQ(String(CV_VERSION), cv::getVersionString()); } + + +// +// Test core/check.hpp macros +// + +void test_check_eq_1(int value_1, int value_2) +{ + CV_CheckEQ(value_1, value_2, "Validation check failed"); +} +TEST(Core_Check, testEQ_int_fail) +{ + try + { + test_check_eq_1(123, 5678); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation check failed (expected: 'value_1 == value_2'), where\n" +"> 'value_1' is 123\n" +"> must be equal to\n" +"> 'value_2' is 5678\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testEQ_int_pass) +{ + EXPECT_NO_THROW( + { + test_check_eq_1(1234, 1234); + }); +} + + +void test_check_eq_2(float value_1, float value_2) +{ + CV_CheckEQ(value_1, value_2, "Validation check failed (float)"); +} +TEST(Core_Check, testEQ_float_fail) +{ + try + { + test_check_eq_2(1234.5f, 1234.55f); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation check failed (float) (expected: 'value_1 == value_2'), where\n" +"> 'value_1' is 1234.5\n" // TODO Locale handling (use LC_ALL=C on Linux) +"> must be equal to\n" +"> 'value_2' is 1234.55\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) 
+ { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testEQ_float_pass) +{ + EXPECT_NO_THROW( + { + test_check_eq_2(1234.6f, 1234.6f); + }); +} + + +void test_check_eq_3(double value_1, double value_2) +{ + CV_CheckEQ(value_1, value_2, "Validation check failed (double)"); +} +TEST(Core_Check, testEQ_double_fail) +{ + try + { + test_check_eq_3(1234.5, 1234.56); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation check failed (double) (expected: 'value_1 == value_2'), where\n" +"> 'value_1' is 1234.5\n" // TODO Locale handling (use LC_ALL=C on Linux) +"> must be equal to\n" +"> 'value_2' is 1234.56\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testEQ_double_pass) +{ + EXPECT_NO_THROW( + { + test_check_eq_3(1234.0f, 1234.0f); + }); +} + + +void test_check_ne_1(int value_1, int value_2) +{ + CV_CheckNE(value_1, value_2, "Validation NE check failed"); +} +TEST(Core_Check, testNE_int_fail) +{ + try + { + test_check_ne_1(123, 123); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation NE check failed (expected: 'value_1 != value_2'), where\n" +"> 'value_1' is 123\n" +"> must be not equal to\n" +"> 'value_2' is 123\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testNE_int_pass) +{ + EXPECT_NO_THROW( + { + test_check_ne_1(123, 1234); + }); +} + + +void test_check_le_1(int value_1, int value_2) +{ + CV_CheckLE(value_1, value_2, "Validation LE check failed"); +} +TEST(Core_Check, testLE_int_fail) +{ + try + { + test_check_le_1(1234, 123); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation LE check failed (expected: 'value_1 <= value_2'), where\n" +"> 'value_1' is 1234\n" +"> must be less than or equal to\n" +"> 'value_2' is 123\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testLE_int_pass) +{ + EXPECT_NO_THROW( + { + test_check_le_1(1234, 1234); + }); + EXPECT_NO_THROW( + { + test_check_le_1(123, 1234); + }); +} + +void test_check_lt_1(int value_1, int value_2) +{ + CV_CheckLT(value_1, value_2, "Validation LT check failed"); +} +TEST(Core_Check, testLT_int_fail) +{ + try + { + test_check_lt_1(1234, 123); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation LT check failed (expected: 'value_1 < value_2'), where\n" +"> 'value_1' is 1234\n" +"> must be less than\n" +"> 'value_2' is 123\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) 
+ { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testLT_int_fail_eq) +{ + try + { + test_check_lt_1(123, 123); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation LT check failed (expected: 'value_1 < value_2'), where\n" +"> 'value_1' is 123\n" +"> must be less than\n" +"> 'value_2' is 123\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testLT_int_pass) +{ + EXPECT_NO_THROW( + { + test_check_lt_1(123, 1234); + }); +} + + +void test_check_ge_1(int value_1, int value_2) +{ + CV_CheckGE(value_1, value_2, "Validation GE check failed"); +} +TEST(Core_Check, testGE_int_fail) +{ + try + { + test_check_ge_1(123, 1234); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation GE check failed (expected: 'value_1 >= value_2'), where\n" +"> 'value_1' is 123\n" +"> must be greater than or equal to\n" +"> 'value_2' is 1234\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testGE_int_pass) +{ + EXPECT_NO_THROW( + { + test_check_ge_1(1234, 1234); + }); + EXPECT_NO_THROW( + { + test_check_ge_1(1234, 123); + }); +} + +void test_check_gt_1(int value_1, int value_2) +{ + CV_CheckGT(value_1, value_2, "Validation GT check failed"); +} +TEST(Core_Check, testGT_int_fail) +{ + try + { + test_check_gt_1(123, 1234); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation GT check failed (expected: 'value_1 > value_2'), where\n" +"> 'value_1' is 123\n" +"> must be greater than\n" +"> 'value_2' is 1234\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testGT_int_fail_eq) +{ + try + { + test_check_gt_1(123, 123); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Validation GT check failed (expected: 'value_1 > value_2'), where\n" +"> 'value_1' is 123\n" +"> must be greater than\n" +"> 'value_2' is 123\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} +TEST(Core_Check, testGT_int_pass) +{ + EXPECT_NO_THROW( + { + test_check_gt_1(1234, 123); + }); +} + + +void test_check_MatType_1(int src_type) +{ + CV_CheckTypeEQ(src_type, CV_32FC1, "Unsupported source type"); +} +TEST(Core_Check, testMatType_pass) +{ + EXPECT_NO_THROW( + { + test_check_MatType_1(CV_MAKE_TYPE(CV_32F, 1)); + }); +} +TEST(Core_Check, testMatType_fail_1) +{ + try + { + test_check_MatType_1(CV_8UC1); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Unsupported source type (expected: 'src_type == CV_32FC1'), where\n" +"> 'src_type' is 0 (CV_8UC1)\n" +"> must be equal to\n" +"> 'CV_32FC1' is 5 (CV_32FC1)\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) 
+ { + FAIL() << "Unexpected unknown exception"; + } +} + +void test_check_MatType_2(int src_type) +{ + CV_CheckType(src_type, src_type == CV_32FC1 || src_type == CV_32FC3, "Unsupported src"); +} +TEST(Core_Check, testMatType_fail_2) +{ + try + { + test_check_MatType_2(CV_8UC1); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Unsupported src:\n" +"> 'src_type == CV_32FC1 || src_type == CV_32FC3'\n" +"> where\n> 'src_type' is 0 (CV_8UC1)\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} + +void test_check_MatDepth_1(int src_depth) +{ + CV_CheckDepthEQ(src_depth, CV_32F, "Unsupported source depth"); +} +TEST(Core_Check, testMatDepth_pass) +{ + EXPECT_NO_THROW( + { + test_check_MatDepth_1(CV_MAKE_TYPE(CV_32F, 1)); + }); +} +TEST(Core_Check, testMatDepth_fail_1) +{ + try + { + test_check_MatDepth_1(CV_8U); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Unsupported source depth (expected: 'src_depth == CV_32F'), where\n" +"> 'src_depth' is 0 (CV_8U)\n" +"> must be equal to\n" +"> 'CV_32F' is 5 (CV_32F)\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} + +void test_check_MatDepth_2(int src_depth) +{ + CV_CheckDepth(src_depth, src_depth == CV_32F || src_depth == CV_64F, "Unsupported src"); +} +TEST(Core_Check, testMatDepth_fail_2) +{ + try + { + test_check_MatDepth_2(CV_8U); + FAIL() << "Unreachable code called"; + } + catch (const cv::Exception& e) + { + EXPECT_STREQ(e.err.c_str(), +"> Unsupported src:\n" +"> 'src_depth == CV_32F || src_depth == CV_64F'\n" +"> where\n> 'src_depth' is 0 (CV_8U)\n" +); + } + catch (const std::exception& e) + { + FAIL() << "Unexpected C++ exception: " << e.what(); + } + catch (...) + { + FAIL() << "Unexpected unknown exception"; + } +} + + }} // namespace diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 2cdf70086c..90538424a6 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -559,7 +559,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN { public: float pnorm, epsilon; - bool acrossSpatial; + CV_DEPRECATED bool acrossSpatial; static Ptr create(const LayerParams& params); }; diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index f1e220c7b3..7f8c7e7499 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -80,7 +80,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN enum Target { DNN_TARGET_CPU, - DNN_TARGET_OPENCL + DNN_TARGET_OPENCL, + DNN_TARGET_OPENCL_FP16 }; /** @brief This class provides all data needed to initialize layer. 
diff --git a/modules/dnn/misc/quantize_face_detector.py b/modules/dnn/misc/quantize_face_detector.py index 06acae976b..a9348c2144 100644 --- a/modules/dnn/misc/quantize_face_detector.py +++ b/modules/dnn/misc/quantize_face_detector.py @@ -318,6 +318,7 @@ for node in graph_def.node: node.input.pop() node.input.pop() node.input.append(layer_256_1_relu1.name) + node.input.append('conv4_3_norm/l2_normalize/Sum/reduction_indices') break softmaxShape = NodeDef() diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 92719a8e39..12a2081685 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -13,7 +13,7 @@ namespace opencv_test { CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE) -CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL) +CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16) class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple > { @@ -41,8 +41,6 @@ public: throw cvtest::SkipTestException("OpenCL is not available/disabled in OpenCV"); } } - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL) - throw SkipTestException("Skip OpenCL target of Inference Engine backend"); randu(input, 0.0f, 1.0f); @@ -89,24 +87,32 @@ public: PERF_TEST_P_(DNNTestNetwork, AlexNet) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", "alexnet.yml", Mat(cv::Size(227, 227), CV_32FC3)); } PERF_TEST_P_(DNNTestNetwork, GoogLeNet) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", "", Mat(cv::Size(224, 224), CV_32FC3)); } PERF_TEST_P_(DNNTestNetwork, ResNet_50) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", "resnet_50.yml", Mat(cv::Size(224, 224), CV_32FC3)); } PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", "squeezenet_v1_1.yml", Mat(cv::Size(227, 227), CV_32FC3)); } @@ -135,14 +141,18 @@ PERF_TEST_P_(DNNTestNetwork, SSD) PERF_TEST_P_(DNNTestNetwork, OpenFace) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); processNet("dnn/openface_nn4.small2.v1.t7", "", "", Mat(cv::Size(96, 96), CV_32FC3)); } PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "", Mat(cv::Size(300, 300), CV_32FC3)); } @@ -150,7 +160,8 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe) PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_TensorFlow) { if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL || - backend == DNN_BACKEND_HALIDE) + backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) throw 
SkipTestException(""); processNet("dnn/ssd_mobilenet_v1_coco.pb", "ssd_mobilenet_v1_coco.pbtxt", "", Mat(cv::Size(300, 300), CV_32FC3)); @@ -158,7 +169,9 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_TensorFlow) PERF_TEST_P_(DNNTestNetwork, DenseNet_121) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "", Mat(cv::Size(224, 224), CV_32FC3)); } @@ -189,7 +202,7 @@ PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages) PERF_TEST_P_(DNNTestNetwork, opencv_face_detector) { if (backend == DNN_BACKEND_HALIDE || - backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL) + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) throw SkipTestException(""); processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", "", Mat(cv::Size(300, 300), CV_32FC3)); @@ -197,7 +210,9 @@ PERF_TEST_P_(DNNTestNetwork, opencv_face_detector) PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "", Mat(cv::Size(300, 300), CV_32FC3)); } @@ -209,6 +224,8 @@ const tuple testCases[] = { #endif #ifdef HAVE_INF_ENGINE tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU), + tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL), + tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), #endif tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU), tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) diff --git a/modules/dnn/src/caffe/opencv-caffe.proto b/modules/dnn/src/caffe/opencv-caffe.proto index 41cd46bb1c..380b51f281 100644 --- a/modules/dnn/src/caffe/opencv-caffe.proto +++ b/modules/dnn/src/caffe/opencv-caffe.proto @@ -50,7 +50,7 @@ syntax = "proto2"; package opencv_caffe; -// NVidia's Caffe feature is used to store fp16 weights, https://github.com/NVIDIA/caffe: +// NVIDIA's Caffe feature is used to store fp16 weights, https://github.com/NVIDIA/caffe: // Math and storage types enum Type { DOUBLE = 0; @@ -72,10 +72,10 @@ message BlobProto { repeated double double_data = 8 [packed = true]; repeated double double_diff = 9 [packed = true]; - // NVidia's Caffe fields begin. + // NVIDIA's Caffe fields begin. optional Type raw_data_type = 10; optional bytes raw_data = 12 [packed = false]; - // NVidia's Caffe fields end. + // NVIDIA's Caffe fields end. // 4D dimensions -- deprecated. Use "shape" instead. optional int32 num = 1 [default = 0]; diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 611e35e386..d82e7cd7ce 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1154,7 +1154,7 @@ struct Net::Impl ld.skip = true; } layers[lastLayerId].skip = false; - ieNode->net->init(); + ieNode->net->init(preferableTarget); return; } @@ -1167,17 +1167,17 @@ struct Net::Impl for (it = layers.begin(); it != layers.end(); ++it) { LayerData &ld = it->second; - ld.skip = true; // Initially skip all Inference Engine supported layers. 
- Ptr layer = ld.layerInstance; + bool fused = ld.skip && ld.id != 0; + Ptr layer = ld.layerInstance; if (!layer->supportBackend(preferableBackend)) { addInfEngineNetOutputs(ld); - ld.skip = false; net = Ptr(); netBlobsWrappers.clear(); continue; } + ld.skip = true; // Initially skip all Inference Engine supported layers. // Create a new network if one of inputs from different Inference Engine graph. for (int i = 0; i < ld.inputBlobsId.size(); ++i) @@ -1217,19 +1217,16 @@ struct Net::Impl } netBlobsWrappers[ld.id] = ld.outputBlobsWrappers[0]; - bool fused = false; Ptr node; if (!net.empty()) { - // Try to fuse. - bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 && - ld.inputBlobs[0]->data == ld.outputBlobs[0].data; - if (inPlace) + if (fused) { - node = layer->tryAttach(layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend]); - fused = !node.empty(); - if (fused) - ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers; + bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 && + ld.inputBlobs[0]->data == ld.outputBlobs[0].data; + CV_Assert(inPlace); + node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend]; + ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers; } } else @@ -1247,6 +1244,19 @@ struct Net::Impl CV_Assert(!ieNode.empty()); ieNode->net = net; + if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !fused) + { + ieNode->layer->precision = InferenceEngine::Precision::FP16; + auto weightableLayer = std::dynamic_pointer_cast(ieNode->layer); + if (weightableLayer) + { + if (weightableLayer->_weights) + weightableLayer->_weights = convertFp16(weightableLayer->_weights); + if (weightableLayer->_biases) + weightableLayer->_biases = convertFp16(weightableLayer->_biases); + } + } + ieNode->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers); net->addBlobs(ld.inputBlobsWrappers); net->addBlobs(ld.outputBlobsWrappers); @@ -1276,7 +1286,7 @@ struct Net::Impl if (!ieNode->net->isInitialized()) { - ieNode->net->init(); + ieNode->net->init(preferableTarget); ld.skip = false; } } @@ -1380,7 +1390,8 @@ struct Net::Impl void fuseLayers(const std::vector& blobsToKeep_) { - if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT) + if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT && + preferableBackend != DNN_BACKEND_INFERENCE_ENGINE) return; CV_TRACE_FUNCTION(); @@ -1407,7 +1418,7 @@ struct Net::Impl // some other layers. // TODO: OpenCL target support more fusion styles. - if ( preferableTarget == DNN_TARGET_OPENCL && + if ( preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL && (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" && ld.layerInstance->type != "MVN")) ) continue; @@ -1442,6 +1453,9 @@ struct Net::Impl break; } + if (preferableBackend != DNN_BACKEND_DEFAULT) + continue; // Go to the next layer. + // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh if ( preferableTarget != DNN_TARGET_OPENCL || (preferableTarget == DNN_TARGET_OPENCL && @@ -1583,6 +1597,9 @@ struct Net::Impl } } + if (preferableBackend != DNN_BACKEND_DEFAULT) + continue; // Go to the next layer. + // the optimization #2. 
if there is no layer that takes max pooling layer's computed // max indices (and only some semantical segmentation networks might need this; // many others only take the maximum values), then we switch the max pooling @@ -1944,7 +1961,8 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin) ld.layerInstance = Ptr(new InfEngineBackendLayer(it.second)); ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode; - cvNet.connect(0, 0, lid, 0); + for (int i = 0; i < inputsNames.size(); ++i) + cvNet.connect(0, i, lid, i); } cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE); diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index df4e553309..c2906b63f5 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -234,19 +234,6 @@ public: #endif // HAVE_HALIDE break; } - case DNN_BACKEND_INFERENCE_ENGINE: - { -#ifdef HAVE_INF_ENGINE - auto base = node.dynamicCast(); - auto conv = std::dynamic_pointer_cast(base->layer); - if (conv) - { - fuseConvWeights(conv, weights_, bias_); - return base; - } -#endif // HAVE_INF_ENGINE - break; - } } return Ptr(); } @@ -287,8 +274,9 @@ public: lp.precision = InferenceEngine::Precision::FP32; std::shared_ptr ieLayer(new InferenceEngine::ScaleShiftLayer(lp)); - ieLayer->_weights = wrapToInfEngineBlob(weights_); - ieLayer->_biases = wrapToInfEngineBlob(bias_); + const int numChannels = weights_.total(); + ieLayer->_weights = wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C); + ieLayer->_biases = wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); #endif // HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 1ed17bb357..0794eff9af 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -40,6 +40,7 @@ // //M*/ #include "../precomp.hpp" +#include "../op_inf_engine.hpp" namespace cv { @@ -53,6 +54,12 @@ public: setParamsFrom(params); } + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine(); + } + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -104,6 +111,19 @@ public: if (outputs[i].data != inputs[i]->data) inputs[i]->copyTo(outputs[i]); } + + virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE + { +#ifdef HAVE_INF_ENGINE + InferenceEngine::LayerParams lp; + lp.name = name; + lp.type = "Split"; + lp.precision = InferenceEngine::Precision::FP32; + std::shared_ptr ieLayer(new InferenceEngine::SplitLayer(lp)); + return Ptr(new InfEngineBackendNode(ieLayer)); +#endif // HAVE_INF_ENGINE + return Ptr(); + } }; Ptr BlankLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 6da843809e..8c52bc0aae 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -173,21 +173,21 @@ public: std::vector biasvec; std::vector reluslope; Ptr activ; + bool newWeightAndBias; + bool fusedBias; #ifdef HAVE_OPENCL Ptr > convolutionOp; std::vector umat_blobs; - bool fusedBias; - bool newWeightAndBias; bool newActiv; ocl4dnnFusedActiv_t activType; float power; #endif ConvolutionLayerImpl(const LayerParams ¶ms) : BaseConvolutionLayerImpl(params) { -#ifdef 
HAVE_OPENCL - fusedBias = false; newWeightAndBias = false; + fusedBias = false; +#ifdef HAVE_OPENCL newActiv = false; activType = OCL4DNN_CONV_FUSED_ACTIV_NONE; power = 0.f; @@ -350,10 +350,8 @@ public: biasvec[i] += b.at(i); } -#ifdef HAVE_OPENCL newWeightAndBias = !w.empty() || !b.empty(); fusedBias = hasBias() || !b.empty(); -#endif biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1]; } @@ -433,9 +431,31 @@ public: ieLayer->_dilation_y = dilation.height; ieLayer->_group = group; - ieLayer->_weights = wrapToInfEngineBlob(blobs[0]); - if (hasBias()) - ieLayer->_biases = wrapToInfEngineBlob(blobs[1]); + ieLayer->_weights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW); + if (newWeightAndBias) + { + if (weightsMat.isContinuous()) + { + Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size); + ieLayer->_weights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW); + } + else + { + ieLayer->_weights = InferenceEngine::make_shared_blob( + InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW, + ieLayer->_weights->dims()); + ieLayer->_weights->allocate(); + + Mat newWeights = infEngineBlobToMat(ieLayer->_weights).reshape(1, outCn); + Mat fusedWeights = weightsMat.colRange(0, newWeights.cols); + fusedWeights.copyTo(newWeights); + } + } + if (hasBias() || fusedBias) + { + Mat biasesMat({outCn}, CV_32F, &biasvec[0]); + ieLayer->_biases = wrapToInfEngineBlob(biasesMat, {outCn}, InferenceEngine::Layout::C); + } return Ptr(new InfEngineBackendNode(ieLayer)); #endif // HAVE_INF_ENGINE return Ptr(); diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 68ca1b4f5f..9ee7e98023 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -412,9 +412,9 @@ public: std::shared_ptr ieLayer(new InferenceEngine::FullyConnectedLayer(lp)); ieLayer->_out_num = blobs[0].size[0]; - ieLayer->_weights = wrapToInfEngineBlob(blobs[0]); + ieLayer->_weights = wrapToInfEngineBlob(blobs[0], {blobs[0].size[0], blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW); if (blobs.size() > 1) - ieLayer->_biases = wrapToInfEngineBlob(blobs[1]); + ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {ieLayer->_out_num}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); #endif // HAVE_INF_ENGINE return Ptr(); diff --git a/modules/dnn/src/layers/layers_common.cpp b/modules/dnn/src/layers/layers_common.cpp index f854d91ccc..bf5834c864 100644 --- a/modules/dnn/src/layers/layers_common.cpp +++ b/modules/dnn/src/layers/layers_common.cpp @@ -77,7 +77,20 @@ bool getParameter(const LayerParams ¶ms, const std::string& nameBase, const { if (params.has(nameAll_)) { - parameterH = parameterW = params.get(nameAll_); + DictValue param = params.get(nameAll_); + parameterH = param.get(0); + if (param.size() == 1) + { + parameterW = parameterH; + } + else if (param.size() == 2) + { + parameterW = param.get(1); + } + else + { + return false; + } return true; } else diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index 5772aad91c..5e8ed65157 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "../op_inf_engine.hpp" namespace cv { namespace dnn { @@ -54,9 +55,19 @@ public: pnorm = params.get("p", 2); epsilon = params.get("eps", 1e-10f); 
acrossSpatial = params.get("across_spatial", true); + startAxis = params.get("start_axis", 1); + CV_Assert(!params.has("across_spatial") || !params.has("end_axis")); + endAxis = params.get("end_axis", acrossSpatial ? -1 : startAxis); CV_Assert(pnorm > 0); } + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && + pnorm == 2 && !blobs.empty(); + } + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -85,20 +96,26 @@ public: const UMat& inp0 = inputs[0]; UMat& buffer = internals[0]; - size_t num = inp0.size[0]; - size_t channels = inp0.size[1]; - size_t channelSize = inp0.total() / (num * channels); + startAxis = clamp(startAxis, inp0.dims); + endAxis = clamp(endAxis, inp0.dims); + + size_t num = total(shape(inp0.size), 0, startAxis); + size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1); + size_t planeSize = inp0.total() / (num * numPlanes); + MatShape s = shape(1, inputs[0].total()); + UMat inp = inputs[0].reshape(1, s.size(), &s[0]).reshape(1, num); + UMat out = outputs[0].reshape(1, s.size(), &s[0]).reshape(1, num); for (size_t i = 0; i < num; ++i) { - MatShape s = shape(channels, channelSize); - UMat src = inputs[i].reshape(1, s.size(), &s[0]); - UMat dst = outputs[i].reshape(1, s.size(), &s[0]); + s = shape(numPlanes, planeSize); + UMat src = inp.row(i).reshape(1, s.size(), &s[0]); + UMat dst = out.row(i).reshape(1, s.size(), &s[0]); UMat abs_mat; absdiff(src, cv::Scalar::all(0), abs_mat); pow(abs_mat, pnorm, buffer); - if (acrossSpatial) + if (planeSize == 1) { // add eps to avoid overflow float absSum = sum(buffer)[0] + epsilon; @@ -114,7 +131,7 @@ public: // compute inverted norm to call multiply instead divide cv::pow(norm, -1.0f / pnorm, norm); - repeat(norm, channels, 1, buffer); + repeat(norm, numPlanes, 1, buffer); multiply(src, buffer, dst); } @@ -130,7 +147,7 @@ public: else { // _scale: _channels x 1 - CV_Assert(scale.total() == channels); + CV_Assert(scale.total() == numPlanes); repeat(scale, 1, dst.cols, buffer); multiply(dst, buffer, dst); } @@ -162,17 +179,22 @@ public: const Mat& inp0 = *inputs[0]; Mat& buffer = internals[0]; - size_t num = inp0.size[0]; - size_t channels = inp0.size[1]; - size_t channelSize = inp0.total() / (num * channels); + startAxis = clamp(startAxis, inp0.dims); + endAxis = clamp(endAxis, inp0.dims); + + const float* inpData = inp0.ptr(); + float* outData = outputs[0].ptr(); + + size_t num = total(shape(inp0.size), 0, startAxis); + size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1); + size_t planeSize = inp0.total() / (num * numPlanes); for (size_t n = 0; n < num; ++n) { - Mat src = Mat(channels, channelSize, CV_32F, (void*)inp0.ptr(n)); - Mat dst = Mat(channels, channelSize, CV_32F, (void*)outputs[0].ptr(n)); - + Mat src = Mat(numPlanes, planeSize, CV_32F, (void*)inpData); + Mat dst = Mat(numPlanes, planeSize, CV_32F, (void*)outData); cv::pow(abs(src), pnorm, buffer); - if (acrossSpatial) + if (planeSize == 1) { // add eps to avoid overflow float absSum = sum(buffer)[0] + epsilon; @@ -188,7 +210,7 @@ public: // compute inverted norm to call multiply instead divide cv::pow(norm, -1.0f / pnorm, norm); - repeat(norm, channels, 1, buffer); + repeat(norm, numPlanes, 1, buffer); multiply(src, buffer, dst); } @@ -204,13 +226,40 @@ public: else { // _scale: _channels x 1 - CV_Assert(scale.total() == channels); + CV_Assert(scale.total() == numPlanes); 
repeat(scale, 1, dst.cols, buffer); multiply(dst, buffer, dst); } } + inpData += numPlanes * planeSize; + outData += numPlanes * planeSize; } } + + virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE + { +#ifdef HAVE_INF_ENGINE + InferenceEngine::LayerParams lp; + lp.name = name; + lp.type = "Normalize"; + lp.precision = InferenceEngine::Precision::FP32; + std::shared_ptr ieLayer(new InferenceEngine::CNNLayer(lp)); + + CV_Assert(!blobs.empty()); + + ieLayer->params["eps"] = format("%f", epsilon); + ieLayer->params["across_spatial"] = acrossSpatial ? "1" : "0"; + ieLayer->params["channel_shared"] = blobs[0].total() == 1 ? "1" : "0"; + + const int numChannels = blobs[0].total(); + ieLayer->blobs["weights"] = wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C); + return Ptr(new InfEngineBackendNode(ieLayer)); +#endif // HAVE_INF_ENGINE + return Ptr(); + } + +private: + int startAxis, endAxis; }; diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 464e3850a8..833c993ae1 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -132,20 +132,6 @@ public: #endif // HAVE_HALIDE break; } - case DNN_BACKEND_INFERENCE_ENGINE: - { -#ifdef HAVE_INF_ENGINE - auto base = node.dynamicCast(); - auto conv = std::dynamic_pointer_cast(base->layer); - if (conv) - { - Mat bias = hasBias ? blobs[1] : Mat(); - fuseConvWeights(conv, blobs[0], bias); - return base; - } -#endif // HAVE_INF_ENGINE - break; - } } return Ptr(); } @@ -192,9 +178,10 @@ public: lp.precision = InferenceEngine::Precision::FP32; std::shared_ptr ieLayer(new InferenceEngine::ScaleShiftLayer(lp)); - ieLayer->_weights = wrapToInfEngineBlob(blobs[0]); + const int numChannels = blobs[0].total(); + ieLayer->_weights = wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C); if (hasBias) - ieLayer->_biases = wrapToInfEngineBlob(blobs[1]); + ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {numChannels}, InferenceEngine::Layout::C); return Ptr(new InfEngineBackendNode(ieLayer)); #endif // HAVE_INF_ENGINE diff --git a/modules/dnn/src/layers/shift_layer.cpp b/modules/dnn/src/layers/shift_layer.cpp index fbbdcb1ea9..7c3bb14df9 100644 --- a/modules/dnn/src/layers/shift_layer.cpp +++ b/modules/dnn/src/layers/shift_layer.cpp @@ -90,27 +90,6 @@ public: } } - virtual Ptr tryAttach(const Ptr& node) CV_OVERRIDE - { - switch (node->backendId) - { - case DNN_BACKEND_INFERENCE_ENGINE: - { -#ifdef HAVE_INF_ENGINE - auto base = node.dynamicCast(); - auto conv = std::dynamic_pointer_cast(base->layer); - if (conv) - { - fuseConvWeights(conv, Mat(), blobs[0]); - return base; - } -#endif // HAVE_INF_ENGINE - break; - } - } - return Ptr(); - } - virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE { #ifdef HAVE_INF_ENGINE diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index cad27ce3b3..129ed94a5d 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -18,6 +18,11 @@ namespace cv { namespace dnn { #ifdef HAVE_INF_ENGINE +static int infEngineVersion() +{ + return std::atoi(InferenceEngine::GetInferenceEngineVersion()->buildNumber); +} + InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& _layer) : BackendNode(DNN_BACKEND_INFERENCE_ENGINE), layer(_layer) {} @@ -58,23 +63,37 @@ static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std: { std::vector reversedShape(&m.size[0], &m.size[0] + m.dims); 
std::reverse(reversedShape.begin(), reversedShape.end()); - return InferenceEngine::DataPtr( - new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, - InferenceEngine::Layout::ANY) - ); + if (infEngineVersion() > 5855) + { + InferenceEngine::Layout l = InferenceEngine::Layout::ANY; + if (m.dims == 4) + l = InferenceEngine::Layout::NCHW; + else if (m.dims == 2) + l = InferenceEngine::Layout::NC; + return InferenceEngine::DataPtr( + new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, l) + ); + } + else + { + return InferenceEngine::DataPtr( + new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32) + ); + } } -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape) +InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape, + InferenceEngine::Layout layout) { return InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, - shape, (float*)m.data); + layout, shape, (float*)m.data); } -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m) +InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout) { std::vector reversedShape(&m.size[0], &m.size[0] + m.dims); std::reverse(reversedShape.begin(), reversedShape.end()); - return wrapToInfEngineBlob(m, reversedShape); + return wrapToInfEngineBlob(m, reversedShape, layout); } InferenceEngine::DataPtr infEngineDataNode(const Ptr& ptr) @@ -109,10 +128,14 @@ void InfEngineBackendWrapper::setHostDirty() InfEngineBackendNet::InfEngineBackendNet() { + targetDevice = InferenceEngine::TargetDevice::eCPU; + precision = InferenceEngine::Precision::FP32; } InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) { + targetDevice = InferenceEngine::TargetDevice::eCPU; + precision = InferenceEngine::Precision::FP32; inputs = net.getInputsInfo(); outputs = net.getOutputsInfo(); layers.resize(net.layerCount()); // A hack to execute InfEngineBackendNet::layerCount correctly. @@ -126,9 +149,14 @@ void InfEngineBackendNet::Release() noexcept outputs.clear(); } +void InfEngineBackendNet::setPrecision(InferenceEngine::Precision p) noexcept +{ + precision = p; +} + InferenceEngine::Precision InfEngineBackendNet::getPrecision() noexcept { - return InferenceEngine::Precision::FP32; + return precision; } // Assume that outputs of network is unconnected blobs. 
@@ -161,9 +189,8 @@ InferenceEngine::InputInfo::Ptr InfEngineBackendNet::getInput(const std::string return it->second; } -void InfEngineBackendNet::getName(char *pName, size_t len) noexcept +void InfEngineBackendNet::getName(char*, size_t) noexcept { - CV_Error(Error::StsNotImplemented, ""); } size_t InfEngineBackendNet::layerCount() noexcept @@ -213,13 +240,15 @@ InfEngineBackendNet::getLayerByName(const char *layerName, InferenceEngine::CNNL void InfEngineBackendNet::setTargetDevice(InferenceEngine::TargetDevice device) noexcept { - if (device != InferenceEngine::TargetDevice::eCPU) + if (device != InferenceEngine::TargetDevice::eCPU && + device != InferenceEngine::TargetDevice::eGPU) CV_Error(Error::StsNotImplemented, ""); + targetDevice = device; } InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() noexcept { - return InferenceEngine::TargetDevice::eCPU; + return targetDevice; } InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t size) noexcept @@ -234,7 +263,7 @@ size_t InfEngineBackendNet::getBatchSize() const noexcept return 0; } -void InfEngineBackendNet::init() +void InfEngineBackendNet::init(int targetId) { if (inputs.empty()) { @@ -307,6 +336,15 @@ void InfEngineBackendNet::init() outBlobs[it.first] = allBlobs[it.first]; } + switch (targetId) + { + case DNN_TARGET_CPU: setTargetDevice(InferenceEngine::TargetDevice::eCPU); break; + case DNN_TARGET_OPENCL_FP16: setPrecision(InferenceEngine::Precision::FP16); // Fallback to the next. + case DNN_TARGET_OPENCL: setTargetDevice(InferenceEngine::TargetDevice::eGPU); break; + default: + CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId)); + } + if (!isInitialized()) initPlugin(*this); } @@ -317,10 +355,9 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) InferenceEngine::StatusCode status; InferenceEngine::ResponseDesc resp; - const InferenceEngine::Version* v = InferenceEngine::GetInferenceEngineVersion(); - plugin = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(InferenceEngine::TargetDevice::eCPU); - if (std::atoi(v->buildNumber) > 5855) + plugin = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(targetDevice); + if (infEngineVersion() > 5855 && targetDevice == InferenceEngine::TargetDevice::eCPU) { #ifdef _WIN32 InferenceEngine::IExtensionPtr extension = @@ -360,7 +397,7 @@ void InfEngineBackendNet::forward() CV_Error(Error::StsAssert, resp.msg); } -static inline Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob) +Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob) { // NOTE: Inference Engine sizes are reversed. std::vector dims = blob->dims(); @@ -369,56 +406,6 @@ static inline Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob) return Mat(size, CV_32F, (void*)blob->buffer()); } -void fuseConvWeights(const std::shared_ptr& conv, - const Mat& w, const Mat& b) -{ - CV_Assert(!w.empty() || !b.empty()); - if (!w.empty()) - { - // Get convolution's weights. Clone the data because Inference Engine can host it - // and conv->_weights->allocate() below will deallocate it. - Mat originWeights = infEngineBlobToMat(conv->_weights).clone(); - - // Create new weights blob. - conv->_weights = InferenceEngine::make_shared_blob( - InferenceEngine::Precision::FP32, conv->_weights->dims()); - conv->_weights->allocate(); - - // Convolution weights have OIHW data layout. 
- // (conv(I) + b1 ) * w + b2 - // w*conv(I) + b1 * w + b2 - Mat fusedWeights = infEngineBlobToMat(conv->_weights); - - const int numChannels = fusedWeights.size[0]; - // Mat weights = blobs[0].reshape(1, 1); - // Mat bias = hasBias ? blobs[1].reshape(1, 1) : Mat(); - CV_Assert(numChannels == w.total()); - CV_Assert(b.empty() || numChannels == b.total()); - for (int i = 0; i < numChannels; ++i) - { - cv::multiply(slice(originWeights, i), w.at(i), slice(fusedWeights, i)); - } - } - if (conv->_biases) - { - // The same for biases. - Mat originBiases = infEngineBlobToMat(conv->_biases).clone(); - - conv->_biases = InferenceEngine::make_shared_blob( - InferenceEngine::Precision::FP32, conv->_biases->dims()); - conv->_biases->allocate(); - Mat fusedBiases = infEngineBlobToMat(conv->_biases); - originBiases.copyTo(fusedBiases); - - if (!w.empty()) - cv::multiply(w.reshape(1, fusedBiases.dims, &fusedBiases.size[0]), fusedBiases, fusedBiases); - if (!b.empty()) - cv::add(fusedBiases, b.reshape(1, fusedBiases.dims, &fusedBiases.size[0]), fusedBiases); - } - else - conv->_biases = wrapToInfEngineBlob(b); -} - InfEngineBackendLayer::InfEngineBackendLayer(const InferenceEngine::DataPtr& output_) { output = output_; @@ -454,6 +441,16 @@ void InfEngineBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArra CV_Error(Error::StsInternal, "Choose Inference Engine as a preferable backend."); } +InferenceEngine::TBlob::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob) +{ + auto halfs = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP16, blob->layout(), blob->dims()); + halfs->allocate(); + Mat floatsData(1, blob->size(), CV_32F, blob->buffer()); + Mat halfsData(1, blob->size(), CV_16SC1, halfs->buffer()); + convertFp16(floatsData, halfsData); + return halfs; +} + #endif // HAVE_INF_ENGINE bool haveInfEngine() diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index 438463586c..67dadd3066 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -32,6 +32,8 @@ public: virtual void Release() noexcept CV_OVERRIDE; + void setPrecision(InferenceEngine::Precision p) noexcept; + virtual InferenceEngine::Precision getPrecision() noexcept CV_OVERRIDE; virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) noexcept /*CV_OVERRIDE*/; @@ -68,7 +70,7 @@ public: virtual size_t getBatchSize() const noexcept CV_OVERRIDE; - void init(); + void init(int targetId); void addBlobs(const std::vector >& wrappers); @@ -83,6 +85,8 @@ private: InferenceEngine::BlobMap inpBlobs; InferenceEngine::BlobMap outBlobs; InferenceEngine::BlobMap allBlobs; + InferenceEngine::TargetDevice targetDevice; + InferenceEngine::Precision precision; InferenceEngine::InferenceEnginePluginPtr plugin; void initPlugin(InferenceEngine::ICNNNetwork& net); @@ -116,15 +120,17 @@ public: InferenceEngine::TBlob::Ptr blob; }; -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m); +InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY); -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape); +InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape, InferenceEngine::Layout layout); InferenceEngine::DataPtr infEngineDataNode(const Ptr& ptr); -// Fuses convolution weights and biases with channel-wise scales and shifts. 
-void fuseConvWeights(const std::shared_ptr& conv, - const Mat& w, const Mat& b = Mat()); +Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob); + +// Convert Inference Engine blob with FP32 precision to FP16 precision. +// Allocates memory for a new blob. +InferenceEngine::TBlob::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob); // This is a fake class to run networks from Model Optimizer. Objects of that // class simulate responses of layers are imported by OpenCV and supported by @@ -151,7 +157,6 @@ private: InferenceEngine::DataPtr output; }; - #endif // HAVE_INF_ENGINE bool haveInfEngine(); diff --git a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp index 2cfb42fa9b..cfb472ec00 100644 --- a/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp +++ b/modules/dnn/src/tensorflow/tf_graph_simplifier.cpp @@ -80,14 +80,16 @@ public: { CV_Assert(inpId < node.input_size()); std::string name = node.input(inpId); + // If operation produces several tensors, they are specified by index + // after ':' character. In example, "input:0". + name = name.substr(0, name.rfind(':')); const int numNodes = net.node_size(); for (int i = 0; i < numNodes; ++i) { if (net.node(i).name() == name) return net.node(i); } - CV_Error(Error::StsParseError, "Input node with name " + name + " not found"); - return net.node(0); // just return something + CV_ErrorNoReturn(Error::StsParseError, "Input node with name " + name + " not found"); } // Match TensorFlow subgraph starting from with a set of nodes to be fused. @@ -400,6 +402,23 @@ private: int numOutDims; }; +class L2NormalizeSubgraph : public Subgraph +{ +public: + L2NormalizeSubgraph() + { + int input = addNodeToMatch(""); + int square = addNodeToMatch("Square", input); + int reductionIndices = addNodeToMatch("Const"); + int sum = addNodeToMatch("Sum", square, reductionIndices); + int y = addNodeToMatch("Const"); + int maximum = addNodeToMatch("Maximum", sum, y); + int rsqrt = addNodeToMatch("Rsqrt", maximum); + addNodeToMatch("Mul", input, rsqrt); + setFusedNode("L2Normalize", input, reductionIndices); + } +}; + void simplifySubgraphs(tensorflow::GraphDef& net) { std::vector > subgraphs; @@ -410,6 +429,7 @@ void simplifySubgraphs(tensorflow::GraphDef& net) subgraphs.push_back(Ptr(new SoftMaxKerasSubgraph())); subgraphs.push_back(Ptr(new ReLU6KerasSubgraph())); subgraphs.push_back(Ptr(new ReshapeKerasSubgraph(3))); + subgraphs.push_back(Ptr(new L2NormalizeSubgraph())); int numNodes = net.node_size(); std::vector matchedNodesIds; diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 6ea9e96a47..f5809163bd 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -37,7 +37,13 @@ using ::google::protobuf::Reflection; namespace { -static int toNCHW[] = {0, 2, 3, 1}; +static int toNCHW(int idx) +{ + CV_Assert(-4 <= idx && idx < 4); + if (idx == 0) return 0; + else if (idx > 0) return idx % 3 + 1; + else return (4 + idx) % 3 + 1; +} // This values are used to indicate layer output's data layout where it's possible. enum DataLayout @@ -556,11 +562,23 @@ static void addConstNodes(tensorflow::GraphDef& net, std::map& cons // this layer's output has this data layout too. Returns DATA_LAYOUT_UNKNOWN otherwise. 
static int predictOutputDataLayout(const tensorflow::NodeDef& layer, const std::map& data_layouts) { + if (hasLayerAttr(layer, "data_format")) + { + std::string format = getLayerAttr(layer, "data_format").s(); + if (format == "NHWC" || format == "channels_last") + return DATA_LAYOUT_NHWC; + else if (format == "NCHW" || format == "channels_first") + return DATA_LAYOUT_NCHW; + else + CV_Error(Error::StsParseError, "Unknown data_format value: " + format); + } + + // Determine layout by layer's inputs int layout = DATA_LAYOUT_UNKNOWN; std::map::const_iterator it; for (int i = 0, n = layer.input_size(); i < n; ++i) { - it = data_layouts.find(layer.input(i)); + it = data_layouts.find(layer.input(i).substr(0, layer.input(i).rfind(':'))); if (it != data_layouts.end()) { if (it->second == DATA_LAYOUT_UNKNOWN) @@ -708,17 +726,7 @@ void TFImporter::populateNet(Net dstNet) // one input only connect(layer_id, dstNet, parsePin(input), id, 0); - if (hasLayerAttr(layer, "data_format")) - { - std::string format = getLayerAttr(layer, "data_format").s(); - if (format == "NHWC" || format == "channels_last") - data_layouts[name] = DATA_LAYOUT_NHWC; - else if (format == "NCHW" || format == "channels_first") - data_layouts[name] = DATA_LAYOUT_NCHW; - else - CV_Error(Error::StsParseError, "Unknown data_format value: " + format); - } - else + if (data_layouts[name] == DATA_LAYOUT_UNKNOWN) data_layouts[name] = DATA_LAYOUT_NHWC; } else if (type == "BiasAdd" || type == "Add") @@ -956,7 +964,7 @@ void TFImporter::populateNet(Net dstNet) { int axisId = (type == "Concat" ? 0 : layer.input_size() - 1); int axis = getConstBlob(layer, value_id, axisId).int_val().Get(0); - layerParams.set("axis", 0 <= axis && axis < 4 ? toNCHW[axis] : axis); + layerParams.set("axis", 0 <= axis && axis < 4 ? toNCHW(axis) : axis); int id = dstNet.addLayer(name, "Concat", layerParams); layer_id[name] = id; @@ -1017,7 +1025,7 @@ void TFImporter::populateNet(Net dstNet) // num_split // 1st blob is dims tensor int axis = getConstBlob(layer, value_id, 0).int_val().Get(0); - layerParams.set("axis", toNCHW[axis]); + layerParams.set("axis", toNCHW(axis)); int id = dstNet.addLayer(name, "Slice", layerParams); layer_id[name] = id; @@ -1410,9 +1418,26 @@ void TFImporter::populateNet(Net dstNet) { // op: "L2Normalize" // input: "input" - CV_Assert(layer.input_size() == 1); - layerParams.set("across_spatial", false); - layerParams.set("channel_shared", false); + // input: "reduction_indices" (axis) + CV_Assert(layer.input_size() == 2); + Mat reductionIndices = getTensorContent(getConstBlob(layer, value_id, 1)); + CV_Assert(reductionIndices.type() == CV_32SC1); + + const int numAxes = reductionIndices.total(); + if (data_layouts[name] == DATA_LAYOUT_NHWC) + for (int i = 0; i < numAxes; ++i) + reductionIndices.at(i) = toNCHW(reductionIndices.at(i)); + + cv::sort(reductionIndices, reductionIndices, SORT_ASCENDING); + for (int i = 1; i < numAxes; ++i) + { + CV_Assert(reductionIndices.at(i) == reductionIndices.at(i - 1) + 1); + // Axes have the same sign. 
+ CV_Assert(reductionIndices.at(i) * reductionIndices.at(i - 1) >= 0); + } + layerParams.set("start_axis", reductionIndices.at(0)); + layerParams.set("end_axis", reductionIndices.at(numAxes - 1)); + int id = dstNet.addLayer(name, "Normalize", layerParams); layer_id[name] = id; connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index db657ee644..ea79119910 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -100,6 +100,8 @@ public: TEST_P(DNNTestNetwork, AlexNet) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt", Size(227, 227), "prob", target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_alexnet.yml" : @@ -108,6 +110,8 @@ TEST_P(DNNTestNetwork, AlexNet) TEST_P(DNNTestNetwork, ResNet_50) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt", Size(224, 224), "prob", target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_resnet_50.yml" : @@ -116,6 +120,8 @@ TEST_P(DNNTestNetwork, ResNet_50) TEST_P(DNNTestNetwork, SqueezeNet_v1_1) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt", Size(227, 227), "prob", target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_squeezenet_v1_1.yml" : @@ -124,6 +130,8 @@ TEST_P(DNNTestNetwork, SqueezeNet_v1_1) TEST_P(DNNTestNetwork, GoogLeNet) { + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt", Size(224, 224), "prob"); } @@ -147,7 +155,9 @@ TEST_P(DNNTestNetwork, ENet) TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); @@ -157,7 +167,9 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe) TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt", @@ -177,35 +189,45 @@ TEST_P(DNNTestNetwork, SSD_VGG16) TEST_P(DNNTestNetwork, OpenPose_pose_coco) { if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + double l1 = target == DNN_TARGET_OPENCL_FP16 ? 3e-5 : 1e-5; + double lInf = target == DNN_TARGET_OPENCL_FP16 ? 
3e-3 : 1e-4; processNet("dnn/openpose_pose_coco.caffemodel", "dnn/openpose_pose_coco.prototxt", - Size(368, 368), ""); + Size(368, 368), "", "", l1, lInf); } TEST_P(DNNTestNetwork, OpenPose_pose_mpi) { if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + double l1 = target == DNN_TARGET_OPENCL_FP16 ? 4e-5 : 1e-5; + double lInf = target == DNN_TARGET_OPENCL_FP16 ? 7e-3 : 1e-4; processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt", - Size(368, 368), ""); + Size(368, 368), "", "", l1, lInf); } TEST_P(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages) { if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + double l1 = target == DNN_TARGET_OPENCL_FP16 ? 5e-5 : 1e-5; + double lInf = target == DNN_TARGET_OPENCL_FP16 ? 5e-3 : 1e-4; // The same .caffemodel but modified .prototxt // See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt", - Size(368, 368), ""); + Size(368, 368), "", "", l1, lInf); } TEST_P(DNNTestNetwork, OpenFace) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); processNet("dnn/openface_nn4.small2.v1.t7", "", Size(96, 96), ""); } TEST_P(DNNTestNetwork, opencv_face_detector) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false)); Mat inp = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false); processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", @@ -214,13 +236,23 @@ TEST_P(DNNTestNetwork, opencv_face_detector) TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow) { - if (backend == DNN_BACKEND_HALIDE) throw SkipTestException(""); + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU) + throw SkipTestException(""); Mat sample = imread(findDataFile("dnn/street.png", false)); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt", inp, "detection_out"); } +TEST_P(DNNTestNetwork, DenseNet_121) +{ + if (backend == DNN_BACKEND_HALIDE || + backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); + processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe"); +} + const tuple testCases[] = { #ifdef HAVE_HALIDE tuple(DNN_BACKEND_HALIDE, DNN_TARGET_CPU), @@ -228,6 +260,8 @@ const tuple testCases[] = { #endif #ifdef HAVE_INF_ENGINE tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU), + tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL), + tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16), #endif tuple(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL) }; diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index dd5a06b5af..413e5275c9 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -866,6 +866,44 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy) normAssert(outDefault, out); } + +// 1. 
Create a .prototxt file with the following network: +// layer { +// type: "Input" name: "data" top: "data" +// input_param { shape { dim: 1 dim: 2 dim: 3 } } +// } +// layer { +// type: "Input" name: "second_input" top: "second_input" +// input_param { shape { dim: 1 dim: 2 dim: 3 } } +// } +// layer { +// type: "Eltwise" name: "output" top: "output" +// bottom: "data" bottom: "second_input" +// eltwise_param { operation: SUM } +// } +// +// 2. Create a .caffemodel file using Caffe: +// +// import caffe +// net = caffe.Net('/path/to/prototxt', caffe.TEST) +// net.save('/path/to/caffemodel') +// +// 3. Convert using ModelOptimizer. +TEST(Test_DLDT, two_inputs) +{ + Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin")); + int inpSize[] = {1, 2, 3}; + Mat firstInp(3, &inpSize[0], CV_32F); + Mat secondInp(3, &inpSize[0], CV_32F); + randu(firstInp, -1, 1); + randu(secondInp, -1, 1); + + net.setInput(firstInp, "data"); + net.setInput(secondInp, "second_input"); + Mat out = net.forward(); + + normAssert(out, firstInp + secondInp); +} #endif // HAVE_INF_ENGINE }} // namespace diff --git a/modules/dnn/test/test_precomp.hpp b/modules/dnn/test/test_precomp.hpp index b4bb97dce0..54c9ce6c79 100644 --- a/modules/dnn/test/test_precomp.hpp +++ b/modules/dnn/test/test_precomp.hpp @@ -53,7 +53,7 @@ namespace opencv_test { using namespace cv::dnn; CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE) -CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL) +CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16) static testing::internal::ParamGenerator availableDnnTargets() { diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index ff2122856b..8d4f4b6047 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -193,6 +193,13 @@ TEST_P(Test_TensorFlow_layers, reshape) runTensorFlowNet("unfused_flatten_unknown_batch", targetId); } +TEST_P(Test_TensorFlow_layers, l2_normalize) +{ + int targetId = GetParam(); + runTensorFlowNet("l2_normalize", targetId); + runTensorFlowNet("l2_normalize_3d", targetId); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_layers, availableDnnTargets()); typedef testing::TestWithParam Test_TensorFlow_nets; diff --git a/modules/imgproc/perf/opencl/perf_imgwarp.cpp b/modules/imgproc/perf/opencl/perf_imgwarp.cpp index 7a90e33380..44fb84d1cc 100644 --- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp +++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp @@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap, OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode); - SANITY_CHECK(dst, eps); + SANITY_CHECK_NOTHING(); } } } // namespace opencv_test::ocl diff --git a/modules/imgproc/perf/perf_warp.cpp b/modules/imgproc/perf/perf_warp.cpp index d0e09a5bab..728a7bba70 100644 --- a/modules/imgproc/perf/perf_warp.cpp +++ b/modules/imgproc/perf/perf_warp.cpp @@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear, PERF_TEST_P( TestRemap, remap, Combine( - Values( TYPICAL_MAT_TYPES ), - Values( szVGA, sz720p, sz1080p ), + Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ), + Values( szVGA, sz1080p ), InterType::all(), BorderMode::all(), RemapMode::all() @@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap, remap(source, destination, map_x, map_y, interpolationType, borderMode); } - SANITY_CHECK(destination, 1); + SANITY_CHECK_NOTHING(); } void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int 
remapMode ) diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp index 16e2278f82..c601ceaf33 100644 --- a/modules/imgproc/src/deriv.cpp +++ b/modules/imgproc/src/deriv.cpp @@ -547,7 +547,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, size_t src_step = _src.step(), src_offset = _src.offset(); const size_t tileSizeYmax = wgs / tileSizeX; - // workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory + // workaround for NVIDIA: 3 channel vector type takes 4*elem_size in local memory int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn; if (((src_offset % src_step) % esz == 0) && diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp index 1c5c88a480..78ba553307 100644 --- a/modules/imgproc/src/fixedpoint.inl.hpp +++ b/modules/imgproc/src/fixedpoint.inl.hpp @@ -36,19 +36,25 @@ public: typedef fixedpoint64 WT; CV_ALWAYS_INLINE fixedpoint64() { val = 0; } CV_ALWAYS_INLINE fixedpoint64(const int8_t& _val) { val = ((int64_t)_val) << fixedShift; } + CV_ALWAYS_INLINE fixedpoint64(const uint8_t& _val) { val = ((int64_t)_val) << fixedShift; } CV_ALWAYS_INLINE fixedpoint64(const int16_t& _val) { val = ((int64_t)_val) << fixedShift; } + CV_ALWAYS_INLINE fixedpoint64(const uint16_t& _val) { val = ((int64_t)_val) << fixedShift; } CV_ALWAYS_INLINE fixedpoint64(const int32_t& _val) { val = ((int64_t)_val) << fixedShift; } CV_ALWAYS_INLINE fixedpoint64(const cv::softdouble& _val) { val = cvRound64(_val * cv::softdouble((int64_t)(1LL << fixedShift))); } CV_ALWAYS_INLINE fixedpoint64& operator = (const int8_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; } + CV_ALWAYS_INLINE fixedpoint64& operator = (const uint8_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE fixedpoint64& operator = (const int16_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; } + CV_ALWAYS_INLINE fixedpoint64& operator = (const uint16_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE fixedpoint64& operator = (const int32_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE fixedpoint64& operator = (const cv::softdouble& _val) { val = cvRound64(_val * cv::softdouble((int64_t)(1LL << fixedShift))); return *this; } CV_ALWAYS_INLINE fixedpoint64& operator = (const fixedpoint64& _val) { val = _val.val; return *this; } - template - CV_ALWAYS_INLINE fixedpoint64 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types + CV_ALWAYS_INLINE fixedpoint64 operator * (const int8_t& val2) const { return operator *(fixedpoint64(val2)); } + CV_ALWAYS_INLINE fixedpoint64 operator * (const uint8_t& val2) const { return operator *(fixedpoint64(val2)); } + CV_ALWAYS_INLINE fixedpoint64 operator * (const int16_t& val2) const { return operator *(fixedpoint64(val2)); } + CV_ALWAYS_INLINE fixedpoint64 operator * (const uint16_t& val2) const { return operator *(fixedpoint64(val2)); } + CV_ALWAYS_INLINE fixedpoint64 operator * (const int32_t& val2) const { return operator *(fixedpoint64(val2)); } CV_ALWAYS_INLINE fixedpoint64 operator * (const fixedpoint64& val2) const { - //Assume -0x00000000C0000000 <= val2 <=0x0000000100000000 INT64_MIN <= val <= INT64_MAX, so shifted multiplication result is inside [INT64_MIN, INT64_MAX] range uint64_t uval = (uint64_t)((val ^ (val >> 63)) - (val >> 63)); uint64_t umul = (uint64_t)((val2.val ^ (val2.val >> 63)) - (val2.val >> 63)); 
int64_t ressign = (val >> 63) ^ (val2.val >> 63); @@ -61,6 +67,9 @@ public: uint64_t val0_h = (sh2 & 0xFFFFFFFF) + (sh1_0 >> 32) + (sh1_1 >> 32) + (val0_l >> 32); val0_l &= 0xFFFFFFFF; + if ( (sh2 >> 32) || (val0_h >> ressign ? 32 : 31) ) + return (ressign ? ~(int64_t)0x7FFFFFFFFFFFFFFF : (int64_t)0x7FFFFFFFFFFFFFFF); + if (ressign) { val0_l = (~val0_l + 1) & 0xFFFFFFFF; @@ -68,16 +77,19 @@ public: } return (int64_t)(val0_h << 32 | val0_l); } - CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const { return fixedpoint64(val + val2.val); } - CV_ALWAYS_INLINE fixedpoint64 operator - (const fixedpoint64& val2) const { return fixedpoint64(val - val2.val); } - // CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const - // { - // int64_t nfrac = (int64_t)frac + val2.frac; - // int64_t nval = (int64_t)val + val2.val + nfrac >> 32; - // return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0); - // } + CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const + { + int64_t res = val + val2.val; + return ((val ^ res) & (val2.val ^ res)) >> 63 ? ~(res & ~0x7FFFFFFFFFFFFFFF) : res; + } + CV_ALWAYS_INLINE fixedpoint64 operator - (const fixedpoint64& val2) const + { + int64_t res = val - val2.val; + return ((val ^ val2.val) & (val ^ res)) >> 63 ? ~(res & ~0x7FFFFFFFFFFFFFFF) : res; + } CV_ALWAYS_INLINE fixedpoint64 operator >> (int n) const { return fixedpoint64(val >> n); } CV_ALWAYS_INLINE fixedpoint64 operator << (int n) const { return fixedpoint64(val << n); } + CV_ALWAYS_INLINE bool operator == (const fixedpoint64& val2) const { return val == val2.val; } template CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast((int64_t)fixedround((uint64_t)val) >> fixedShift); } CV_ALWAYS_INLINE operator double() const { return (double)val / (1LL << fixedShift); } @@ -108,31 +120,36 @@ public: CV_ALWAYS_INLINE ufixedpoint64& operator = (const uint32_t& _val) { val = ((uint64_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE ufixedpoint64& operator = (const cv::softdouble& _val) { val = _val.getSign() ? 
0 : (uint64_t)cvRound64(_val * cv::softdouble((int64_t)(1LL << fixedShift))); return *this; } CV_ALWAYS_INLINE ufixedpoint64& operator = (const ufixedpoint64& _val) { val = _val.val; return *this; } - template - CV_ALWAYS_INLINE ufixedpoint64 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types + CV_ALWAYS_INLINE ufixedpoint64 operator * (const uint8_t& val2) const { return operator *(ufixedpoint64(val2)); } + CV_ALWAYS_INLINE ufixedpoint64 operator * (const uint16_t& val2) const { return operator *(ufixedpoint64(val2)); } + CV_ALWAYS_INLINE ufixedpoint64 operator * (const uint32_t& val2) const { return operator *(ufixedpoint64(val2)); } CV_ALWAYS_INLINE ufixedpoint64 operator * (const ufixedpoint64& val2) const { - //Assume val2 <=0x0000000100000000, so shifted multiplication result is less than val and therefore than UINT64_MAX uint64_t sh0 = fixedround((val & 0xFFFFFFFF) * (val2.val & 0xFFFFFFFF)); uint64_t sh1_0 = (val >> 32) * (val2.val & 0xFFFFFFFF); uint64_t sh1_1 = (val & 0xFFFFFFFF) * (val2.val >> 32); - uint64_t sh2 = (val >> 32) * (val2.val >> 32); + uint64_t sh2 = (val >> 32) * (val2.val >> 32); uint64_t val0_l = (sh1_0 & 0xFFFFFFFF) + (sh1_1 & 0xFFFFFFFF) + (sh0 >> 32); uint64_t val0_h = (sh2 & 0xFFFFFFFF) + (sh1_0 >> 32) + (sh1_1 >> 32) + (val0_l >> 32); val0_l &= 0xFFFFFFFF; + if ((sh2 >> 32) || (val0_h >> 32)) + return ((uint64_t)0xFFFFFFFFFFFFFFFF); + return val0_h << 32 | val0_l; } - CV_ALWAYS_INLINE ufixedpoint64 operator + (const ufixedpoint64& val2) const { return ufixedpoint64(val + val2.val); } - CV_ALWAYS_INLINE ufixedpoint64 operator - (const ufixedpoint64& val2) const { return ufixedpoint64(val - val2.val); } - // CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const - // { - // int64_t nfrac = (int64_t)frac + val2.frac; - // int64_t nval = (int64_t)val + val2.val + nfrac >> 32; - // return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0); - // } + CV_ALWAYS_INLINE ufixedpoint64 operator + (const ufixedpoint64& val2) const + { + uint64_t res = val + val2.val; + return (val > res) ? (uint64_t)0xFFFFFFFFFFFFFFFF : res; + } + CV_ALWAYS_INLINE ufixedpoint64 operator - (const ufixedpoint64& val2) const + { + return val > val2.val ? 
(val - val2.val) : 0; + } CV_ALWAYS_INLINE ufixedpoint64 operator >> (int n) const { return ufixedpoint64(val >> n); } CV_ALWAYS_INLINE ufixedpoint64 operator << (int n) const { return ufixedpoint64(val << n); } + CV_ALWAYS_INLINE bool operator == (const ufixedpoint64& val2) const { return val == val2.val; } template CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast(fixedround(val) >> fixedShift); } CV_ALWAYS_INLINE operator double() const { return (double)val / (1LL << fixedShift); } @@ -163,21 +180,26 @@ public: CV_ALWAYS_INLINE fixedpoint32& operator = (const int16_t& _val) { val = ((int32_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE fixedpoint32& operator = (const cv::softdouble& _val) { val = (int32_t)cvRound(_val * cv::softdouble((1 << fixedShift))); return *this; } CV_ALWAYS_INLINE fixedpoint32& operator = (const fixedpoint32& _val) { val = _val.val; return *this; } - template - CV_ALWAYS_INLINE fixedpoint32 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types + CV_ALWAYS_INLINE fixedpoint32 operator * (const int8_t& val2) const { return cv::saturate_cast((int64_t)val * val2); } + CV_ALWAYS_INLINE fixedpoint32 operator * (const uint8_t& val2) const { return cv::saturate_cast((int64_t)val * val2); } + CV_ALWAYS_INLINE fixedpoint32 operator * (const int16_t& val2) const { return cv::saturate_cast((int64_t)val * val2); } CV_ALWAYS_INLINE fixedpoint64 operator * (const fixedpoint32& val2) const { return (int64_t)val * (int64_t)(val2.val); } - CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const { return fixedpoint32(val + val2.val); } - CV_ALWAYS_INLINE fixedpoint32 operator - (const fixedpoint32& val2) const { return fixedpoint32(val - val2.val); } - // CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const - // { - // int32_t nfrac = (int32_t)frac + val2.frac; - // int32_t nval = (int32_t)val + val2.val + nfrac >> 32; - // return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0); - // } + CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const + { + int32_t res = val + val2.val; + return ((val ^ res) & (val2.val ^ res)) >> 31 ? ~(res & ~0x7FFFFFFF) : res; + } + CV_ALWAYS_INLINE fixedpoint32 operator - (const fixedpoint32& val2) const + { + int32_t res = val - val2.val; + return ((val ^ val2.val) & (val ^ res)) >> 31 ? ~(res & ~0x7FFFFFFF) : res; + } CV_ALWAYS_INLINE fixedpoint32 operator >> (int n) const { return fixedpoint32(val >> n); } CV_ALWAYS_INLINE fixedpoint32 operator << (int n) const { return fixedpoint32(val << n); } + CV_ALWAYS_INLINE bool operator == (const fixedpoint32& val2) const { return val == val2.val; } template CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast((int32_t)fixedround((uint32_t)val) >> fixedShift); } + CV_ALWAYS_INLINE operator fixedpoint64() const { return (int64_t)val << (fixedpoint64::fixedShift - fixedShift); } CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); } CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); } CV_ALWAYS_INLINE bool isZero() { return val == 0; } @@ -204,21 +226,24 @@ public: CV_ALWAYS_INLINE ufixedpoint32& operator = (const uint16_t& _val) { val = ((uint32_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE ufixedpoint32& operator = (const cv::softdouble& _val) { val = _val.getSign() ? 
0 : (uint32_t)cvRound(_val * cv::softdouble((1 << fixedShift))); return *this; } CV_ALWAYS_INLINE ufixedpoint32& operator = (const ufixedpoint32& _val) { val = _val.val; return *this; } - template - CV_ALWAYS_INLINE ufixedpoint32 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types + CV_ALWAYS_INLINE ufixedpoint32 operator * (const uint8_t& val2) const { return cv::saturate_cast((uint64_t)val * val2); } + CV_ALWAYS_INLINE ufixedpoint32 operator * (const uint16_t& val2) const { return cv::saturate_cast((uint64_t)val * val2); } CV_ALWAYS_INLINE ufixedpoint64 operator * (const ufixedpoint32& val2) const { return (uint64_t)val * (uint64_t)(val2.val); } - CV_ALWAYS_INLINE ufixedpoint32 operator + (const ufixedpoint32& val2) const { return ufixedpoint32(val + val2.val); } - CV_ALWAYS_INLINE ufixedpoint32 operator - (const ufixedpoint32& val2) const { return ufixedpoint32(val - val2.val); } - // CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const - // { - // int32_t nfrac = (int32_t)frac + val2.frac; - // int32_t nval = (int32_t)val + val2.val + nfrac >> 32; - // return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0); - // } + CV_ALWAYS_INLINE ufixedpoint32 operator + (const ufixedpoint32& val2) const + { + uint32_t res = val + val2.val; + return (val > res) ? 0xFFFFFFFF : res; + } + CV_ALWAYS_INLINE ufixedpoint32 operator - (const ufixedpoint32& val2) const + { + return val > val2.val ? (val - val2.val) : 0; + } CV_ALWAYS_INLINE ufixedpoint32 operator >> (int n) const { return ufixedpoint32(val >> n); } CV_ALWAYS_INLINE ufixedpoint32 operator << (int n) const { return ufixedpoint32(val << n); } + CV_ALWAYS_INLINE bool operator == (const ufixedpoint32& val2) const { return val == val2.val; } template CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast(fixedround(val) >> fixedShift); } + CV_ALWAYS_INLINE operator ufixedpoint64() const { return (uint64_t)val << (ufixedpoint64::fixedShift - fixedShift); } CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); } CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); } CV_ALWAYS_INLINE bool isZero() { return val == 0; } @@ -239,20 +264,28 @@ public: typedef fixedpoint32 WT; CV_ALWAYS_INLINE fixedpoint16() { val = 0; } CV_ALWAYS_INLINE fixedpoint16(const int8_t& _val) { val = ((int16_t)_val) << fixedShift; } - CV_ALWAYS_INLINE fixedpoint16(const uint8_t& _val) { val = ((int16_t)_val) << fixedShift; } CV_ALWAYS_INLINE fixedpoint16(const cv::softdouble& _val) { val = (int16_t)cvRound(_val * cv::softdouble((1 << fixedShift))); } CV_ALWAYS_INLINE fixedpoint16& operator = (const int8_t& _val) { val = ((int16_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE fixedpoint16& operator = (const cv::softdouble& _val) { val = (int16_t)cvRound(_val * cv::softdouble((1 << fixedShift))); return *this; } CV_ALWAYS_INLINE fixedpoint16& operator = (const fixedpoint16& _val) { val = _val.val; return *this; } - template - CV_ALWAYS_INLINE fixedpoint16 operator * (const ET& val2) const { return (int16_t)(val * val2); } // Wrong rounding is possible for floating point types + CV_ALWAYS_INLINE fixedpoint16 operator * (const int8_t& val2) const { return cv::saturate_cast((int32_t)val * val2); } CV_ALWAYS_INLINE fixedpoint32 operator * (const fixedpoint16& val2) const { return (int32_t)val * (int32_t)(val2.val); } - CV_ALWAYS_INLINE fixedpoint16 operator + (const fixedpoint16& val2) const { return 
fixedpoint16((int16_t)(val + val2.val)); } - CV_ALWAYS_INLINE fixedpoint16 operator - (const fixedpoint16& val2) const { return fixedpoint16((int16_t)(val - val2.val)); } + CV_ALWAYS_INLINE fixedpoint16 operator + (const fixedpoint16& val2) const + { + int16_t res = val + val2.val; + return ((val ^ res) & (val2.val ^ res)) >> 15 ? (int16_t)(~(res & ~0x7FFF)) : res; + } + CV_ALWAYS_INLINE fixedpoint16 operator - (const fixedpoint16& val2) const + { + int16_t res = val - val2.val; + return ((val ^ val2.val) & (val ^ res)) >> 15 ? (int16_t)(~(res & ~(int16_t)0x7FFF)) : res; + } CV_ALWAYS_INLINE fixedpoint16 operator >> (int n) const { return fixedpoint16((int16_t)(val >> n)); } CV_ALWAYS_INLINE fixedpoint16 operator << (int n) const { return fixedpoint16((int16_t)(val << n)); } + CV_ALWAYS_INLINE bool operator == (const fixedpoint16& val2) const { return val == val2.val; } template CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast((int16_t)fixedround((uint16_t)val) >> fixedShift); } + CV_ALWAYS_INLINE operator fixedpoint32() const { return (int32_t)val << (fixedpoint32::fixedShift - fixedShift); } CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); } CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); } CV_ALWAYS_INLINE bool isZero() { return val == 0; } @@ -276,15 +309,23 @@ public: CV_ALWAYS_INLINE ufixedpoint16& operator = (const uint8_t& _val) { val = ((uint16_t)_val) << fixedShift; return *this; } CV_ALWAYS_INLINE ufixedpoint16& operator = (const cv::softdouble& _val) { val = _val.getSign() ? 0 : (uint16_t)cvRound(_val * cv::softdouble((int32_t)(1 << fixedShift))); return *this; } CV_ALWAYS_INLINE ufixedpoint16& operator = (const ufixedpoint16& _val) { val = _val.val; return *this; } - template - CV_ALWAYS_INLINE ufixedpoint16 operator * (const ET& val2) const { return (uint16_t)(val * val2); } // Wrong rounding is possible for floating point types + CV_ALWAYS_INLINE ufixedpoint16 operator * (const uint8_t& val2) const { return cv::saturate_cast((uint32_t)val * val2); } CV_ALWAYS_INLINE ufixedpoint32 operator * (const ufixedpoint16& val2) const { return ((uint32_t)val * (uint32_t)(val2.val)); } - CV_ALWAYS_INLINE ufixedpoint16 operator + (const ufixedpoint16& val2) const { return ufixedpoint16((uint16_t)(val + val2.val)); } - CV_ALWAYS_INLINE ufixedpoint16 operator - (const ufixedpoint16& val2) const { return ufixedpoint16((uint16_t)(val - val2.val)); } + CV_ALWAYS_INLINE ufixedpoint16 operator + (const ufixedpoint16& val2) const + { + uint16_t res = val + val2.val; + return (val > res) ? (uint16_t)0xFFFF : res; + } + CV_ALWAYS_INLINE ufixedpoint16 operator - (const ufixedpoint16& val2) const + { + return val > val2.val ? 
(uint16_t)(val - val2.val) : (uint16_t)0; + } CV_ALWAYS_INLINE ufixedpoint16 operator >> (int n) const { return ufixedpoint16((uint16_t)(val >> n)); } CV_ALWAYS_INLINE ufixedpoint16 operator << (int n) const { return ufixedpoint16((uint16_t)(val << n)); } + CV_ALWAYS_INLINE bool operator == (const ufixedpoint16& val2) const { return val == val2.val; } template CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast(fixedround(val) >> fixedShift); } + CV_ALWAYS_INLINE operator ufixedpoint32() const { return (uint32_t)val << (ufixedpoint32::fixedShift - fixedShift); } CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); } CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); } CV_ALWAYS_INLINE bool isZero() { return val == 0; } diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index d793616cfc..58ae385585 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -50,8 +50,9 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" #include "hal_replacement.hpp" - +#include "opencv2/core/hal/intrin.hpp" #include "opencv2/core/openvx/ovx_defs.hpp" +#include "opencv2/core/softfloat.hpp" #include "imgwarp.hpp" using namespace cv; @@ -130,7 +131,7 @@ static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2]; static float BilinearTab_f[INTER_TAB_SIZE2][2][2]; static short BilinearTab_i[INTER_TAB_SIZE2][2][2]; -#if CV_SSE2 || CV_NEON +#if CV_SIMD128 static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8]; static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16); #endif @@ -172,7 +173,7 @@ static inline void interpolateLanczos4( float x, float* coeffs ) } float sum = 0; - double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0); + double y0=-(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0= std::cos(y0); for(int i = 0; i < 8; i++ ) { double y = -(x+3-i)*CV_PI*0.25; @@ -266,7 +267,7 @@ static const void* initInterTab2D( int method, bool fixpt ) } tab -= INTER_TAB_SIZE2*ksize*ksize; itab -= INTER_TAB_SIZE2*ksize*ksize; -#if CV_SSE2 || CV_NEON +#if CV_SIMD128 if( method == INTER_LINEAR ) { for( i = 0; i < INTER_TAB_SIZE2; i++ ) @@ -432,7 +433,7 @@ struct RemapNoVec const void*, int ) const { return 0; } }; -#if CV_SSE2 +#if CV_SIMD128 struct RemapVec_8u { @@ -441,190 +442,192 @@ struct RemapVec_8u { int cn = _src.channels(), x = 0, sstep = (int)_src.step; - if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) || + if( (cn != 1 && cn != 3 && cn != 4) || !hasSIMD128() || sstep > 0x8000 ) return 0; const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1); const short* wtab = cn == 1 ? 
(const short*)_wtab : &BilinearTab_iC4[0][0][0]; uchar* D = (uchar*)_dst; - __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2); - __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16)); - __m128i z = _mm_setzero_si128(); + v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2); + v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16))); int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4]; + const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes; +#define CV_PICK_AND_PACK_RGB(ptr, offset, result) \ + { \ + const uchar* const p = ((const uchar*)ptr) + (offset); \ + if (p <= src_limit_8bytes) \ + { \ + v_uint8x16 rrggbb, dummy; \ + v_uint16x8 rrggbb8, dummy8; \ + v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \ + v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + 3), 0, 0, 0)); \ + v_zip(rgb0, rgb1, rrggbb, dummy); \ + v_expand(rrggbb, rrggbb8, dummy8); \ + result = v_reinterpret_as_s16(rrggbb8); \ + } \ + else \ + { \ + result = v_int16x8((short)p[0], (short)p[3], /* r0r1 */ \ + (short)p[1], (short)p[4], /* g0g1 */ \ + (short)p[2], (short)p[5], /* b0b1 */ 0, 0); \ + } \ + } +#define CV_PICK_AND_PACK_RGBA(ptr, offset, result) \ + { \ + const uchar* const p = ((const uchar*)ptr) + (offset); \ + CV_DbgAssert(p <= src_limit_8bytes); \ + v_uint8x16 rrggbbaa, dummy; \ + v_uint16x8 rrggbbaa8, dummy8; \ + v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(int*)(p), 0, 0, 0)); \ + v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(int*)(p + v_int32x4::nlanes), 0, 0, 0)); \ + v_zip(rgba0, rgba1, rrggbbaa, dummy); \ + v_expand(rrggbbaa, rrggbbaa8, dummy8); \ + result = v_reinterpret_as_s16(rrggbbaa8); \ + } +#define CV_PICK_AND_PACK4(base,offset) \ + v_uint16x8(*(ushort*)(base + offset[0]), *(ushort*)(base + offset[1]), \ + *(ushort*)(base + offset[2]), *(ushort*)(base + offset[3]), \ + 0, 0, 0, 0) if( cn == 1 ) { for( ; x <= width - 8; x += 8 ) { - __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); - __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8)); - __m128i v0, v1, v2, v3, a0, a1, b0, b1; - unsigned i0, i1; - - xy0 = _mm_madd_epi16( xy0, xy2ofs ); - xy1 = _mm_madd_epi16( xy1, xy2ofs ); - _mm_store_si128( (__m128i*)iofs0, xy0 ); - _mm_store_si128( (__m128i*)iofs1, xy1 ); - - i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16); - i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16); - v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16); - i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16); - v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - v0 = _mm_unpacklo_epi8(v0, z); - v1 = _mm_unpacklo_epi8(v1, z); - - a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4))); - a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4))); - b0 = _mm_unpacklo_epi64(a0, a1); - b1 = _mm_unpackhi_epi64(a0, a1); - v0 = _mm_madd_epi16(v0, b0); - v1 = _mm_madd_epi16(v1, b1); - v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta); - - i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16); - i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16); - v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16); - i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 
16); - v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); - v2 = _mm_unpacklo_epi8(v2, z); - v3 = _mm_unpacklo_epi8(v3, z); - - a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4))); - a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)), - _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4))); - b0 = _mm_unpacklo_epi64(a0, a1); - b1 = _mm_unpackhi_epi64(a0, a1); - v2 = _mm_madd_epi16(v2, b0); - v3 = _mm_madd_epi16(v3, b1); - v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta); - - v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS); - v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS); - v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z); - _mm_storel_epi64( (__m128i*)(D + x), v0 ); + v_int16x8 _xy0 = v_load(XY + x*2); + v_int16x8 _xy1 = v_load(XY + x*2 + 8); + v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2; + + v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs ); + v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs ); + v_store( iofs0, xy0 ); + v_store( iofs1, xy1 ); + + v_uint16x8 stub, dummy; + v_uint16x8 vec16; + vec16 = CV_PICK_AND_PACK4(S0, iofs0); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v0 = v_reinterpret_as_s32(stub); + vec16 = CV_PICK_AND_PACK4(S1, iofs0); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v1 = v_reinterpret_as_s32(stub); + + v_zip(v_load_low((int*)(wtab + FXY[x] * 4)), v_load_low((int*)(wtab + FXY[x + 1] * 4)), a0, a1); + v_zip(v_load_low((int*)(wtab + FXY[x + 2] * 4)), v_load_low((int*)(wtab + FXY[x + 3] * 4)), b0, b1); + v_recombine(a0, b0, a2, b2); + v1 = v_dotprod(v_reinterpret_as_s16(v1), v_reinterpret_as_s16(b2), delta); + v0 = v_dotprod(v_reinterpret_as_s16(v0), v_reinterpret_as_s16(a2), v1); + + vec16 = CV_PICK_AND_PACK4(S0, iofs1); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v2 = v_reinterpret_as_s32(stub); + vec16 = CV_PICK_AND_PACK4(S1, iofs1); + v_expand(v_reinterpret_as_u8(vec16), stub, dummy); + v3 = v_reinterpret_as_s32(stub); + + v_zip(v_load_low((int*)(wtab + FXY[x + 4] * 4)), v_load_low((int*)(wtab + FXY[x + 5] * 4)), c0, c1); + v_zip(v_load_low((int*)(wtab + FXY[x + 6] * 4)), v_load_low((int*)(wtab + FXY[x + 7] * 4)), d0, d1); + v_recombine(c0, d0, c2, d2); + v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta); + v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3); + + v0 = v0 >> INTER_REMAP_COEF_BITS; + v2 = v2 >> INTER_REMAP_COEF_BITS; + v_pack_u_store(D + x, v_pack(v0, v2)); } } else if( cn == 3 ) { for( ; x <= width - 5; x += 4, D += 12 ) { - __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); - __m128i u0, v0, u1, v1; - - xy0 = _mm_madd_epi16( xy0, xy2ofs ); - _mm_store_si128( (__m128i*)iofs0, xy0 ); - const __m128i *w0, *w1; - w0 = (const __m128i*)(wtab + FXY[x]*16); - w1 = (const __m128i*)(wtab + FXY[x+1]*16); - - u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = _mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, 
w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_slli_si128(u0, 4); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1)); - - w0 = (const __m128i*)(wtab + FXY[x+2]*16); - w1 = (const __m128i*)(wtab + FXY[x+3]*16); - - u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = _mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_slli_si128(u0, 4); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1)); + v_int16x8 u0, v0, u1, v1; + v_int16x8 _xy0 = v_load(XY + x * 2); + + v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs); + v_store(iofs0, xy0); + + int offset0 = FXY[x] * 16; + int offset1 = FXY[x + 1] * 16; + int offset2 = FXY[x + 2] * 16; + int offset3 = FXY[x + 3] * 16; + v_int16x8 w00 = v_load(wtab + offset0); + v_int16x8 w01 = v_load(wtab + offset0 + 8); + v_int16x8 w10 = v_load(wtab + offset1); + v_int16x8 w11 = v_load(wtab + offset1 + 8); + + CV_PICK_AND_PACK_RGB(S0, iofs0[0], u0); + CV_PICK_AND_PACK_RGB(S1, iofs0[0], v0); + CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1); + CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1); + + v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + + result0 = v_rotate_left<1>(result0); + v_int16x8 result8 = v_pack(result0, result1); + v_uint8x16 result16 = v_pack_u(result8, result8); + v_store_low(D, v_rotate_right<1>(result16)); + + + w00 = v_load(wtab + offset2); + w01 = v_load(wtab + offset2 + 8); + w10 = v_load(wtab + offset3); + w11 = v_load(wtab + offset3 + 8); + CV_PICK_AND_PACK_RGB(S0, iofs0[2], u0); + CV_PICK_AND_PACK_RGB(S1, iofs0[2], v0); + CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1); + CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1); + + result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + + result0 = v_rotate_left<1>(result0); + result8 = v_pack(result0, result1); + result16 = v_pack_u(result8, result8); + v_store_low(D + 6, v_rotate_right<1>(result16)); } } else if( cn == 4 ) { for( ; x <= width - 4; x += 4, D += 16 ) { - __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); - __m128i u0, v0, u1, v1; - - xy0 = _mm_madd_epi16( xy0, xy2ofs ); - _mm_store_si128( (__m128i*)iofs0, xy0 ); - const __m128i *w0, *w1; - w0 = (const __m128i*)(wtab + FXY[x]*16); - w1 = (const __m128i*)(wtab + FXY[x+1]*16); - - u0 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = _mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)D, u0); - - w0 = (const __m128i*)(wtab + FXY[x+2]*16); - w1 = (const __m128i*)(wtab + FXY[x+3]*16); - - u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4))); - v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4))); - u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4))); - v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), - _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4))); - u0 = _mm_unpacklo_epi8(u0, z); - v0 = _mm_unpacklo_epi8(v0, z); - u1 = _mm_unpacklo_epi8(u1, z); - v1 = _mm_unpacklo_epi8(v1, z); - u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); - u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); - u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); - u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); - u0 = _mm_packs_epi32(u0, u1); - u0 = _mm_packus_epi16(u0, u0); - _mm_storel_epi64((__m128i*)(D + 8), u0); + v_int16x8 _xy0 = v_load(XY + x * 2); + v_int16x8 u0, v0, u1, v1; + + v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs ); + v_store(iofs0, xy0); + int offset0 = FXY[x] * 16; + int offset1 = FXY[x + 1] * 16; + int offset2 = FXY[x + 2] * 16; + int offset3 = FXY[x + 3] * 16; + + v_int16x8 w00 = v_load(wtab + offset0); + v_int16x8 w01 = v_load(wtab + offset0 + 8); + v_int16x8 w10 = v_load(wtab + offset1); + v_int16x8 w11 = v_load(wtab + offset1 + 8); + CV_PICK_AND_PACK_RGBA(S0, iofs0[0], u0); + CV_PICK_AND_PACK_RGBA(S1, iofs0[0], v0); + CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1); + CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1); + + v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + v_int16x8 result8 = v_pack(result0, result1); + v_pack_u_store(D, result8); + + w00 = v_load(wtab + offset2); + w01 = v_load(wtab + offset2 + 8); + w10 = v_load(wtab + offset3); + w11 = v_load(wtab + offset3 + 8); + CV_PICK_AND_PACK_RGBA(S0, iofs0[2], u0); + CV_PICK_AND_PACK_RGBA(S1, iofs0[2], v0); + CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1); + CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1); + + result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; + result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; + result8 = v_pack(result0, result1); + v_pack_u_store(D + 8, result8); } } @@ -660,7 +663,7 
@@ static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy, unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0); CV_Assert( ssize.area() > 0 ); -#if CV_SSE2 +#if CV_SIMD128 if( _src.type() == CV_8UC3 ) width1 = std::max(ssize.width-2, 0); #endif @@ -1091,9 +1094,9 @@ public: int brows0 = std::min(128, dst->rows), map_depth = m1->depth(); int bcols0 = std::min(buf_size/brows0, dst->cols); brows0 = std::min(buf_size/bcols0, dst->rows); - #if CV_SSE2 - bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); - #endif +#if CV_SIMD128 + bool useSIMD = hasSIMD128(); +#endif Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa; if( !nnfunc ) @@ -1139,29 +1142,24 @@ public: const float* sY = m2->ptr(y+y1) + x; x1 = 0; - #if CV_SSE2 + #if CV_SIMD128 if( useSIMD ) { - for( ; x1 <= bcols - 8; x1 += 8 ) + int span = v_float32x4::nlanes; + for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { - __m128 fx0 = _mm_loadu_ps(sX + x1); - __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); - __m128 fy0 = _mm_loadu_ps(sY + x1); - __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); - __m128i ix0 = _mm_cvtps_epi32(fx0); - __m128i ix1 = _mm_cvtps_epi32(fx1); - __m128i iy0 = _mm_cvtps_epi32(fy0); - __m128i iy1 = _mm_cvtps_epi32(fy1); - ix0 = _mm_packs_epi32(ix0, ix1); - iy0 = _mm_packs_epi32(iy0, iy1); - ix1 = _mm_unpacklo_epi16(ix0, iy0); - iy1 = _mm_unpackhi_epi16(ix0, iy0); - _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); - _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); + v_int32x4 ix0 = v_round(v_load(sX + x1)); + v_int32x4 iy0 = v_round(v_load(sY + x1)); + v_int32x4 ix1 = v_round(v_load(sX + x1 + span)); + v_int32x4 iy1 = v_round(v_load(sY + x1 + span)); + + v_int16x8 dx, dy; + dx = v_pack(ix0, ix1); + dy = v_pack(iy0, iy1); + v_store_interleave(XY + x1 * 2, dx, dy); } } - #endif - + #endif for( ; x1 < bcols; x1++ ) { XY[x1*2] = saturate_cast(sX[x1]); @@ -1186,16 +1184,15 @@ public: const ushort* sA = m2->ptr(y+y1) + x; x1 = 0; - #if CV_NEON - uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1); - for ( ; x1 <= bcols - 8; x1 += 8) - vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale)); - #elif CV_SSE2 - __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1); - for ( ; x1 <= bcols - 8; x1 += 8) - _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale)); - #endif - + #if CV_SIMD128 + if (useSIMD) + { + v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1); + int span = v_uint16x8::nlanes; + for( ; x1 <= bcols - span; x1 += span ) + v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale); + } + #endif for( ; x1 < bcols; x1++ ) A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); } @@ -1205,60 +1202,29 @@ public: const float* sY = m2->ptr(y+y1) + x; x1 = 0; - #if CV_SSE2 + #if CV_SIMD128 if( useSIMD ) { - __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE); - __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - for( ; x1 <= bcols - 8; x1 += 8 ) + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1); + int span = v_float32x4::nlanes; + for( ; x1 <= bcols - span * 2; x1 += span * 2 ) { - __m128 fx0 = _mm_loadu_ps(sX + x1); - __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); - __m128 fy0 = _mm_loadu_ps(sY + x1); - __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); - __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale)); - __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale)); - __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale)); - __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale)); - __m128i mx0 = _mm_and_si128(ix0, 
mask); - __m128i mx1 = _mm_and_si128(ix1, mask); - __m128i my0 = _mm_and_si128(iy0, mask); - __m128i my1 = _mm_and_si128(iy1, mask); - mx0 = _mm_packs_epi32(mx0, mx1); - my0 = _mm_packs_epi32(my0, my1); - my0 = _mm_slli_epi16(my0, INTER_BITS); - mx0 = _mm_or_si128(mx0, my0); - _mm_storeu_si128((__m128i*)(A + x1), mx0); - ix0 = _mm_srai_epi32(ix0, INTER_BITS); - ix1 = _mm_srai_epi32(ix1, INTER_BITS); - iy0 = _mm_srai_epi32(iy0, INTER_BITS); - iy1 = _mm_srai_epi32(iy1, INTER_BITS); - ix0 = _mm_packs_epi32(ix0, ix1); - iy0 = _mm_packs_epi32(iy0, iy1); - ix1 = _mm_unpacklo_epi16(ix0, iy0); - iy1 = _mm_unpackhi_epi16(ix0, iy0); - _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); - _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); + v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1)); + v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1)); + v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span)); + v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span)); + v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2)); + v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2)); + v_uint16x8 v_v = v_shl(v_sy8) | (v_sx8); + v_store(A + x1, v_v); + + v_int16x8 v_d0 = v_pack(v_shr(v_sx0), v_shr(v_sx1)); + v_int16x8 v_d1 = v_pack(v_shr(v_sy0), v_shr(v_sy1)); + v_store_interleave(XY + (x1 << 1), v_d0, v_d1); } } - #elif CV_NEON - float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); - int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); - - for( ; x1 <= bcols - 4; x1 += 4 ) - { - int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)), - v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale)); - int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, - vandq_s32(v_sy, v_scale2)); - vst1_u16(A + x1, vqmovun_s32(v_v)); - - int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); - vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); - } - #endif - + #endif for( ; x1 < bcols; x1++ ) { int sx = cvRound(sX[x1]*INTER_TAB_SIZE); @@ -1274,26 +1240,33 @@ public: const float* sXY = m1->ptr(y+y1) + x*2; x1 = 0; - #if CV_NEON - float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE); - int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); - - for( ; x1 <= bcols - 4; x1 += 4 ) + #if CV_SIMD128 + if( useSIMD ) { - float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1)); - int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale)); - int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale)); - int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, - vandq_s32(v_sy, v_scale2)); - vst1_u16(A + x1, vqmovun_s32(v_v)); - - int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); - vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE); + int span = v_float32x4::nlanes; + for( ; x1 <= bcols - span * 2; x1 += span * 2 ) + { + v_float32x4 v_fx, v_fy; + v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy); + v_int32x4 v_sx0 = v_round(v_fx * v_scale); + v_int32x4 v_sy0 = v_round(v_fy * v_scale); + v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy); + v_int32x4 v_sx1 = v_round(v_fx * v_scale); + v_int32x4 v_sy1 = v_round(v_fy * v_scale); + 
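For orientation, each of these converted loops builds the same two per-pixel outputs that its scalar tail produces: a CV_16SC2 integer coordinate and a CV_16UC1 index into the precomputed bilinear weight table, packed as (yfrac << INTER_BITS) | xfrac. A self-contained scalar sketch of that encoding follows, assuming OpenCV's usual INTER_BITS == 5 and INTER_TAB_SIZE == 32; the function name is illustrative only.

    #include <algorithm>
    #include <cmath>

    enum { INTER_BITS = 5, INTER_TAB_SIZE = 1 << INTER_BITS };  // OpenCV's usual values

    // Encode one float map entry into the packed form remap expects:
    //   xy - integer pixel coordinate (CV_16SC2 element)
    //   a  - 10-bit index into the precomputed bilinear weight table (CV_16UC1 element)
    static void encodeMapEntry(float mapx, float mapy, short xy[2], unsigned short& a)
    {
        int sx = (int)std::lround(mapx * INTER_TAB_SIZE);            // Q*.5 fixed point
        int sy = (int)std::lround(mapy * INTER_TAB_SIZE);
        xy[0] = (short)std::clamp(sx >> INTER_BITS, -32768, 32767);  // integer part, saturated
        xy[1] = (short)std::clamp(sy >> INTER_BITS, -32768, 32767);
        a = (unsigned short)((sy & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE
                           + (sx & (INTER_TAB_SIZE - 1)));           // yfrac, xfrac packed
    }
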
v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2)); + v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2)); + v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1)); + v_store(A + x1, v_v8); + v_int16x8 v_dx = v_pack(v_shr(v_sx0), v_shr(v_sx1)); + v_int16x8 v_dy = v_pack(v_shr(v_sy0), v_shr(v_sy1)); + v_store_interleave(XY + (x1 << 1), v_dx, v_dy); + } } - #endif + #endif - for( x1 = 0; x1 < bcols; x1++ ) + for( ; x1 < bcols; x1++ ) { int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE); int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE); @@ -1915,8 +1888,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, size.height = 1; } -#if CV_SSE2 - bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); +#if CV_SIMD128 + bool useSIMD = hasSIMD128(); #endif #if CV_TRY_SSE4_1 bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; @@ -1941,67 +1914,75 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { if( nninterpolate ) { - #if CV_NEON - for( ; x <= size.width - 8; x += 8 ) - { - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), - vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))), - vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4)))); - - vst2q_s16(dst1 + (x << 1), v_dst); - } - #elif CV_TRY_SSE4_1 + #if CV_TRY_SSE4_1 if (useSSE4_1) opt_SSE4_1::convertMaps_nninterpolate32f1c16s_SSE41(src1f, src2f, dst1, size.width); else #endif - for( ; x < size.width; x++ ) { - dst1[x*2] = saturate_cast(src1f[x]); - dst1[x*2+1] = saturate_cast(src2f[x]); + #if CV_SIMD128 + if( useSIMD ) + { + int span = v_int16x8::nlanes; + for( ; x <= size.width - span; x += span ) + { + v_int16x8 v_dst[2]; + #define CV_PACK_MAP(X) v_pack(v_round(v_load(X)), v_round(v_load((X)+4))) + v_dst[0] = CV_PACK_MAP(src1f + x); + v_dst[1] = CV_PACK_MAP(src2f + x); + #undef CV_PACK_MAP + v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); + } + } + #endif + for( ; x < size.width; x++ ) + { + dst1[x*2] = saturate_cast(src1f[x]); + dst1[x*2+1] = saturate_cast(src2f[x]); + } } } else { - #if CV_NEON - float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); - int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); - - for( ; x <= size.width - 8; x += 8 ) - { - int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale)); - int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale)); - int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale)); - int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale)); - - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); - - vst2q_s16(dst1 + (x << 1), v_dst); - - uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), - vandq_s32(v_ix0, v_mask))); - uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), - vandq_s32(v_ix1, v_mask))); - vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); - } - #elif CV_TRY_SSE4_1 + #if CV_TRY_SSE4_1 if (useSSE4_1) opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width); else #endif - for( ; x < size.width; x++ ) { - int ix = saturate_cast(src1f[x]*INTER_TAB_SIZE); - int iy = saturate_cast(src2f[x]*INTER_TAB_SIZE); - 
dst1[x*2] = saturate_cast(ix >> INTER_BITS); - dst1[x*2+1] = saturate_cast(iy >> INTER_BITS); - dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + #if CV_SIMD128 + if( useSIMD ) + { + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); + v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); + int span = v_float32x4::nlanes; + for( ; x <= size.width - span * 2; x += span * 2 ) + { + v_int32x4 v_ix0 = v_round(v_scale * (v_load(src1f + x))); + v_int32x4 v_ix1 = v_round(v_scale * (v_load(src1f + x + span))); + v_int32x4 v_iy0 = v_round(v_scale * (v_load(src2f + x))); + v_int32x4 v_iy1 = v_round(v_scale * (v_load(src2f + x + span))); + + v_int16x8 v_dst[2]; + v_dst[0] = v_pack(v_shr(v_ix0), v_shr(v_ix1)); + v_dst[1] = v_pack(v_shr(v_iy0), v_shr(v_iy1)); + v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); + + v_int32x4 v_dst0 = v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)); + v_int32x4 v_dst1 = v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)); + v_store(dst2 + x, v_pack_u(v_dst0, v_dst1)); + } + } + #endif + for( ; x < size.width; x++ ) + { + int ix = saturate_cast(src1f[x]*INTER_TAB_SIZE); + int iy = saturate_cast(src2f[x]*INTER_TAB_SIZE); + dst1[x*2] = saturate_cast(ix >> INTER_BITS); + dst1[x*2+1] = saturate_cast(iy >> INTER_BITS); + dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + } } } } @@ -2009,16 +1990,12 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { if( nninterpolate ) { - #if CV_NEON - for( ; x <= (size.width << 1) - 8; x += 8 ) - vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), - vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); - #elif CV_SSE2 - for( ; x <= (size.width << 1) - 8; x += 8 ) - { - _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), - _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)))); - } + #if CV_SIMD128 + int span = v_float32x4::nlanes; + if( useSIMD ) + for( ; x <= (size.width << 1) - span * 2; x += span * 2 ) + v_store(dst1 + x, v_pack(v_round(v_load(src1f + x)), + v_round(v_load(src1f + x + span)))); #endif for( ; x < size.width; x++ ) { @@ -2028,118 +2005,92 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, } else { - #if CV_NEON - float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); - int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); - - for( ; x <= size.width - 8; x += 8 ) - { - float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8); - int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale)); - int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale)); - int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale)); - int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale)); - - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), - vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); - - vst2q_s16(dst1 + (x << 1), v_dst); - - uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), - vandq_s32(v_ix0, v_mask))); - uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), - vandq_s32(v_ix1, v_mask))); - vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); - } - #elif CV_TRY_SSE4_1 - 
if (useSSE4_1) + #if CV_TRY_SSE4_1 + if( useSSE4_1 ) opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, size.width); else #endif - for( ; x < size.width; x++ ) { - int ix = saturate_cast(src1f[x*2]*INTER_TAB_SIZE); - int iy = saturate_cast(src1f[x*2+1]*INTER_TAB_SIZE); - dst1[x*2] = saturate_cast(ix >> INTER_BITS); - dst1[x*2+1] = saturate_cast(iy >> INTER_BITS); - dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + #if CV_SIMD128 + if( useSIMD ) + { + v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); + v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); + v_int32x4 v_scale3 = v_setall_s32(INTER_TAB_SIZE); + int span = v_uint16x8::nlanes; + for (; x <= size.width - span; x += span ) + { + v_float32x4 v_src0[2], v_src1[2]; + v_load_deinterleave(src1f + (x << 1), v_src0[0], v_src0[1]); + v_load_deinterleave(src1f + (x << 1) + span, v_src1[0], v_src1[1]); + v_int32x4 v_ix0 = v_round(v_src0[0] * v_scale); + v_int32x4 v_ix1 = v_round(v_src1[0] * v_scale); + v_int32x4 v_iy0 = v_round(v_src0[1] * v_scale); + v_int32x4 v_iy1 = v_round(v_src1[1] * v_scale); + + v_int16x8 v_dst[2]; + v_dst[0] = v_pack(v_shr(v_ix0), v_shr(v_ix1)); + v_dst[1] = v_pack(v_shr(v_iy0), v_shr(v_iy1)); + v_store_interleave(dst1 + (x << 1), v_dst[0], v_dst[1]); + + v_store(dst2 + x, v_pack_u( + v_muladd(v_scale3, (v_iy0 & v_mask), (v_ix0 & v_mask)), + v_muladd(v_scale3, (v_iy1 & v_mask), (v_ix1 & v_mask)))); + } + } + #endif + for( ; x < size.width; x++ ) + { + int ix = saturate_cast(src1f[x*2]*INTER_TAB_SIZE); + int iy = saturate_cast(src1f[x*2+1]*INTER_TAB_SIZE); + dst1[x*2] = saturate_cast(ix >> INTER_BITS); + dst1[x*2+1] = saturate_cast(iy >> INTER_BITS); + dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); + } } } } else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 ) { - #if CV_NEON - uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1); - uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1); - float32x4_t v_scale = vdupq_n_f32(scale); - - for( ; x <= size.width - 8; x += 8) + #if CV_SIMD128 + if( useSIMD ) { - uint32x4_t v_fxy1, v_fxy2; - if (src2) + v_uint16x8 v_mask2 = v_setall_u16(INTER_TAB_SIZE2-1); + v_uint32x4 v_zero = v_setzero_u32(), v_mask = v_setall_u32(INTER_TAB_SIZE-1); + v_float32x4 v_scale = v_setall_f32(scale); + int span = v_float32x4::nlanes; + for( ; x <= size.width - span * 2; x += span * 2 ) { - uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2); - v_fxy1 = vmovl_u16(vget_low_u16(v_src2)); - v_fxy2 = vmovl_u16(vget_high_u16(v_src2)); + v_uint32x4 v_fxy1, v_fxy2; + if ( src2 ) + { + v_uint16x8 v_src2 = v_load(src2 + x) & v_mask2; + v_expand(v_src2, v_fxy1, v_fxy2); + } + else + v_fxy1 = v_fxy2 = v_zero; + + v_int16x8 v_src[2]; + v_int32x4 v_src0[2], v_src1[2]; + v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]); + v_expand(v_src[0], v_src0[0], v_src0[1]); + v_expand(v_src[1], v_src1[0], v_src1[1]); + #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) & v_mask)),\ + v_cvt_f32(v_reinterpret_as_s32(X))) + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32(v_reinterpret_as_s32((FXY) >> INTER_BITS)),\ + v_cvt_f32(v_reinterpret_as_s32(Y))) + v_float32x4 v_dst1 = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); + v_float32x4 v_dst2 = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); + v_store(dst1f + x, v_dst1); + v_store(dst2f + x, v_dst2); + + v_dst1 = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2); + v_dst2 = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2); + 
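The CV_COMPUTE_MAP_X / CV_COMPUTE_MAP_Y macros above perform the inverse of the encoding used earlier in convertMaps: the stored short coordinate supplies the integer part and the packed table index supplies the fraction, scaled by 1/INTER_TAB_SIZE. A scalar sketch of the same decode, with the constants assumed to be OpenCV's usual INTER_BITS == 5 / INTER_TAB_SIZE == 32 and the function name purely illustrative:

    enum { INTER_BITS = 5, INTER_TAB_SIZE = 1 << INTER_BITS };  // OpenCV's usual values

    // Decode one packed remap entry back to float coordinates.
    //   xy  - CV_16SC2 integer coordinate
    //   fxy - packed 10-bit fractional index (0 when the second map is absent)
    static void decodeMapEntry(const short xy[2], unsigned short fxy, float& mapx, float& mapy)
    {
        const float scale = 1.f / INTER_TAB_SIZE;
        unsigned fr = fxy & (INTER_TAB_SIZE * INTER_TAB_SIZE - 1); // keep the 10 valid bits
        mapx = xy[0] + scale * (fr & (INTER_TAB_SIZE - 1));        // low 5 bits: x fraction
        mapy = xy[1] + scale * (fr >> INTER_BITS);                 // high 5 bits: y fraction
    }
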
v_store(dst1f + x + span, v_dst1); + v_store(dst2f + x + span, v_dst2); + #undef CV_COMPUTE_MAP_X + #undef CV_COMPUTE_MAP_Y } - else - v_fxy1 = v_fxy2 = v_zero; - - int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); - float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), - v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask))); - float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), - v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS))); - vst1q_f32(dst1f + x, v_dst1); - vst1q_f32(dst2f + x, v_dst2); - - v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), - v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask))); - v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), - v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS))); - vst1q_f32(dst1f + x + 4, v_dst1); - vst1q_f32(dst2f + x + 4, v_dst2); - } - #elif CV_SSE2 - __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); - __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128 v_scale = _mm_set1_ps(scale); - - for( ; x <= size.width - 16; x += 16) - { - __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); - __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8)); - __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16)); - __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24)); - - _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21); - - __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; - __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); - v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); - - v_fxy = src2 ? 
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero; - v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); - v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); - _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); - _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)), - _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); } #endif for( ; x < size.width; x++ ) @@ -2151,56 +2102,42 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, } else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 ) { - #if CV_NEON - int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1); - int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1); - float32x4_t v_scale = vdupq_n_f32(scale); - - for( ; x <= size.width - 8; x += 8) + #if CV_SIMD128 + if( useSIMD ) { - int32x4_t v_fxy1, v_fxy2; - if (src2) + v_int16x8 v_mask2 = v_setall_s16(INTER_TAB_SIZE2-1); + v_int32x4 v_zero = v_setzero_s32(), v_mask = v_setall_s32(INTER_TAB_SIZE-1); + v_float32x4 v_scale = v_setall_f32(scale); + int span = v_int16x8::nlanes; + for( ; x <= size.width - span; x += span ) { - int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2); - v_fxy1 = vmovl_s16(vget_low_s16(v_src2)); - v_fxy2 = vmovl_s16(vget_high_s16(v_src2)); - } - else - v_fxy1 = v_fxy2 = v_zero; - - int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); - float32x4x2_t v_dst; - v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), - v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask))); - v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), - v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS))); - vst2q_f32(dst1f + (x << 1), v_dst); - - v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), - v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask))); - v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), - v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); - vst2q_f32(dst1f + (x << 1) + 8, v_dst); - } - #elif CV_SSE2 - if (useSSE2) - { - __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); - __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); - __m128 v_scale = _mm_set1_ps(scale); - - for ( ; x <= size.width - 8; x += 8) - { - __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); - __m128i v_fxy = src2 ? 
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; - __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); - __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); - - __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); - - v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); - _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); + v_int32x4 v_fxy1, v_fxy2; + if (src2) + { + v_int16x8 v_src2 = v_load((short *)src2 + x) & v_mask2; + v_expand(v_src2, v_fxy1, v_fxy2); + } + else + v_fxy1 = v_fxy2 = v_zero; + + v_int16x8 v_src[2]; + v_int32x4 v_src0[2], v_src1[2]; + v_float32x4 v_dst[2]; + v_load_deinterleave(src1 + (x << 1), v_src[0], v_src[1]); + v_expand(v_src[0], v_src0[0], v_src0[1]); + v_expand(v_src[1], v_src1[0], v_src1[1]); + + #define CV_COMPUTE_MAP_X(X, FXY) v_muladd(v_scale, v_cvt_f32((FXY) & v_mask), v_cvt_f32(X)) + #define CV_COMPUTE_MAP_Y(Y, FXY) v_muladd(v_scale, v_cvt_f32((FXY) >> INTER_BITS), v_cvt_f32(Y)) + v_dst[0] = CV_COMPUTE_MAP_X(v_src0[0], v_fxy1); + v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[0], v_fxy1); + v_store_interleave(dst1f + (x << 1), v_dst[0], v_dst[1]); + + v_dst[0] = CV_COMPUTE_MAP_X(v_src0[1], v_fxy2); + v_dst[1] = CV_COMPUTE_MAP_Y(v_src1[1], v_fxy2); + v_store_interleave(dst1f + (x << 1) + span, v_dst[0], v_dst[1]); + #undef CV_COMPUTE_MAP_X + #undef CV_COMPUTE_MAP_Y } } #endif @@ -2242,8 +2179,8 @@ public: #if CV_TRY_AVX2 bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #endif - #if CV_SSE2 - bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #if CV_SIMD128 + bool useSIMD = hasSIMD128(); #endif #if CV_TRY_SSE4_1 bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1; @@ -2272,94 +2209,70 @@ public: if( interpolation == INTER_NEAREST ) { x1 = 0; - #if CV_NEON - int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0); - for( ; x1 <= bw - 8; x1 += 8 ) - { - int16x8x2_t v_dst; - v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS))); - v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)), - vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS))); - - vst2q_s16(xy + (x1 << 1), v_dst); - } - #elif CV_TRY_SSE4_1 - if (useSSE4_1) + #if CV_TRY_SSE4_1 + if( useSSE4_1 ) opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw); else #endif - for( ; x1 < bw; x1++ ) { - int X = (X0 + adelta[x+x1]) >> AB_BITS; - int Y = (Y0 + bdelta[x+x1]) >> AB_BITS; - xy[x1*2] = saturate_cast(X); - xy[x1*2+1] = saturate_cast(Y); + #if CV_SIMD128 + if( useSIMD ) + { + v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0); + int span = v_uint16x8::nlanes; + for( ; x1 <= bw - span; x1 += span ) + { + v_int16x8 v_dst[2]; + #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr(shift+v_load(ptr + offset)),\ + v_shr(shift+v_load(ptr + offset + 4))) + v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0); + v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0); + #undef CV_CONVERT_MAP + v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]); + } + } + #endif + for( ; x1 < bw; x1++ ) + { + int X = (X0 + adelta[x+x1]) >> AB_BITS; + int Y = (Y0 + bdelta[x+x1]) >> AB_BITS; + xy[x1*2] = saturate_cast(X); + xy[x1*2+1] = saturate_cast(Y); + } } } else { short* 
alpha = A + y1*bw; x1 = 0; - #if CV_TRY_AVX2 + #if CV_TRY_AVX2 if ( useAVX2 ) x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw); - #endif - #if CV_SSE2 - if( useSSE2 ) + #endif + #if CV_SIMD128 + if( useSIMD ) { - __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); - __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0); - for( ; x1 <= bw - 8; x1 += 8 ) + v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0); + v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1); + int span = v_float32x4::nlanes; + for( ; x1 <= bw - span * 2; x1 += span * 2 ) { - __m128i tx0, tx1, ty0, ty1; - tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX); - ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY); - tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX); - ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY); - - tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS); - ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS); - tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS); - ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS); - - __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask), - _mm_and_si128(tx1, fxy_mask)); - __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask), - _mm_and_si128(ty1, fxy_mask)); - tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS), - _mm_srai_epi32(tx1, INTER_BITS)); - ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS), - _mm_srai_epi32(ty1, INTER_BITS)); - fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS)); - - _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0)); - _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0)); - _mm_storeu_si128((__m128i*)(alpha + x1), fx_); + v_int32x4 v_X0 = v_shr(v__X0 + v_load(adelta + x + x1)); + v_int32x4 v_Y0 = v_shr(v__Y0 + v_load(bdelta + x + x1)); + v_int32x4 v_X1 = v_shr(v__X0 + v_load(adelta + x + x1 + span)); + v_int32x4 v_Y1 = v_shr(v__Y0 + v_load(bdelta + x + x1 + span)); + + v_int16x8 v_xy[2]; + v_xy[0] = v_pack(v_shr(v_X0), v_shr(v_X1)); + v_xy[1] = v_pack(v_shr(v_Y0), v_shr(v_Y1)); + v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]); + + v_int32x4 v_alpha0 = v_shl(v_Y0 & v_mask) | (v_X0 & v_mask); + v_int32x4 v_alpha1 = v_shl(v_Y1 & v_mask) | (v_X1 & v_mask); + v_store(alpha + x1, v_pack(v_alpha0, v_alpha1)); } } - #elif CV_NEON - int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); - for( ; x1 <= bw - 8; x1 += 8 ) - { - int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS); - int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS); - int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS); - int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS); - - int16x8x2_t v_xy; - v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS))); - v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS))); - - vst2q_s16(xy + (x1 << 1), v_xy); - - int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS), - vandq_s32(v_X0, v_mask))); - int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS), - vandq_s32(v_X1, v_mask))); - vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, 
v_alpha1)); - } - #endif + #endif for( ; x1 < bw; x1++ ) { int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS); @@ -3079,8 +2992,8 @@ cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale ) CV_INSTRUMENT_REGION() angle *= CV_PI/180; - double alpha = cos(angle)*scale; - double beta = sin(angle)*scale; + double alpha = std::cos(angle)*scale; + double beta = std::sin(angle)*scale; Mat M(2, 3, CV_64F); double* m = M.ptr(); @@ -3199,30 +3112,30 @@ void cv::invertAffineTransform(InputArray _matM, OutputArray __iM) if( matM.type() == CV_32F ) { - const float* M = matM.ptr(); - float* iM = _iM.ptr(); + const softfloat* M = matM.ptr(); + softfloat* iM = _iM.ptr(); int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0])); - double D = M[0]*M[step+1] - M[1]*M[step]; - D = D != 0 ? 1./D : 0; - double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; - double b1 = -A11*M[2] - A12*M[step+2]; - double b2 = -A21*M[2] - A22*M[step+2]; + softdouble D = M[0]*M[step+1] - M[1]*M[step]; + D = D != 0. ? softdouble(1.)/D : softdouble(0.); + softdouble A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; + softdouble b1 = -A11*M[2] - A12*M[step+2]; + softdouble b2 = -A21*M[2] - A22*M[step+2]; - iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1; - iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2; + iM[0] = A11; iM[1] = A12; iM[2] = b1; + iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2; } else if( matM.type() == CV_64F ) { - const double* M = matM.ptr(); - double* iM = _iM.ptr(); + const softdouble* M = matM.ptr(); + softdouble* iM = _iM.ptr(); int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0])); - double D = M[0]*M[step+1] - M[1]*M[step]; - D = D != 0 ? 1./D : 0; - double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; - double b1 = -A11*M[2] - A12*M[step+2]; - double b2 = -A21*M[2] - A22*M[step+2]; + softdouble D = M[0]*M[step+1] - M[1]*M[step]; + D = D != 0. ? 
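// What invertAffineTransform computes, written out for a single row-major 2x3 matrix
// in plain doubles.  The patch switches the internal arithmetic to softfloat/softdouble
// so the result is bit-exact across platforms; this sketch keeps ordinary doubles for
// clarity and is illustrative only.
inline void invertAffine2x3(const double M[6], double iM[6])
{
    // M = [a b tx; c d ty] maps p to A*p + t.  The inverse is
    // A^-1 = (1/det) * [d -b; -c a] and t' = -A^-1 * t.
    double det = M[0] * M[4] - M[1] * M[3];
    double D   = det != 0 ? 1.0 / det : 0.0;   // degenerate input yields the zero matrix, as in OpenCV
    double A11 =  M[4] * D, A12 = -M[1] * D;
    double A21 = -M[3] * D, A22 =  M[0] * D;
    iM[0] = A11; iM[1] = A12; iM[2] = -A11 * M[2] - A12 * M[5];
    iM[3] = A21; iM[4] = A22; iM[5] = -A21 * M[2] - A22 * M[5];
}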
softdouble(1.)/D : softdouble(0.); + softdouble A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; + softdouble b1 = -A11*M[2] - A12*M[step+2]; + softdouble b2 = -A21*M[2] - A22*M[step+2]; iM[0] = A11; iM[1] = A12; iM[2] = b1; iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2; @@ -3497,8 +3410,8 @@ void cv::logPolar( InputArray _src, OutputArray _dst, for (phi = 0; phi < dsize.height; phi++) { - double cp = cos(phi * 2 * CV_PI / dsize.height); - double sp = sin(phi * 2 * CV_PI / dsize.height); + double cp = std::cos(phi * 2 * CV_PI / dsize.height); + double sp = std::sin(phi * 2 * CV_PI / dsize.height); float* mx = (float*)(mapx.data + phi*mapx.step); float* my = (float*)(mapy.data + phi*mapy.step); @@ -3699,8 +3612,8 @@ void cv::linearPolar( InputArray _src, OutputArray _dst, for (phi = 0; phi < dsize.height; phi++) { - double cp = cos(phi * 2 * CV_PI / dsize.height); - double sp = sin(phi * 2 * CV_PI / dsize.height); + double cp = std::cos(phi * 2 * CV_PI / dsize.height); + double sp = std::sin(phi * 2 * CV_PI / dsize.height); float* mx = (float*)(mapx.data + phi*mapx.step); float* my = (float*)(mapy.data + phi*mapy.step); diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 9b1c9fa32e..87c07b9c71 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1826,7 +1826,7 @@ void hlineSmooth1N(const uint8_t* src, int cn, const ufi int lencn = len*cn; v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)m)); int i = 0; - for (; i < lencn - 15; i += 16) + for (; i <= lencn - 16; i += 16) { v_uint8x16 v_src = v_load(src + i); v_uint16x8 v_tmp0, v_tmp1; @@ -1834,7 +1834,7 @@ void hlineSmooth1N(const uint8_t* src, int cn, const ufi v_store((uint16_t*)dst + i, v_mul*v_tmp0); v_store((uint16_t*)dst + i + 8, v_mul*v_tmp1); } - if (i < lencn - 7) + if (i <= lencn - 8) { v_uint16x8 v_src = v_load_expand(src + i); v_store((uint16_t*)dst + i, v_mul*v_src); @@ -1854,7 +1854,7 @@ void hlineSmooth1N1(const uint8_t* src, int cn, const uf { int lencn = len*cn; int i = 0; - for (; i < lencn - 15; i += 16) + for (; i <= lencn - 16; i += 16) { v_uint8x16 v_src = v_load(src + i); v_uint16x8 v_tmp0, v_tmp1; @@ -1862,7 +1862,7 @@ void hlineSmooth1N1(const uint8_t* src, int cn, const uf v_store((uint16_t*)dst + i, v_shl<8>(v_tmp0)); v_store((uint16_t*)dst + i + 8, v_shl<8>(v_tmp1)); } - if (i < lencn - 7) + if (i <= lencn - 8) { v_uint16x8 v_src = v_load_expand(src + i); v_store((uint16_t*)dst + i, v_shl<8>(v_src)); @@ -1930,33 +1930,17 @@ void hlineSmooth3N(const uint8_t* src, int cn, const ufi src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; - v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m))); - v_int16x8 v_mul2 = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + 2)))); - for (; i < lencn - 15; i += 16, src += 16, dst += 16) + v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m)); + v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)(m + 1))); + v_uint16x8 v_mul2 = v_setall_u16(*((uint16_t*)(m + 2))); + for (; i <= lencn - 16; i += 16, src += 16, dst += 16) { - v_uint16x8 v_src00, v_src01, v_src10, v_src11; - v_int16x8 v_tmp0, v_tmp1; - + v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_load(src - cn), v_src00, v_src01); v_expand(v_load(src), v_src10, v_src11); - v_zip(v_reinterpret_as_s16(v_src00), v_reinterpret_as_s16(v_src10), v_tmp0, v_tmp1); - v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul01); - v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_reinterpret_as_s16(v_src01), 
v_reinterpret_as_s16(v_src11), v_tmp0, v_tmp1); - v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul01); - v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul01); - - v_int32x4 v_resj0, v_resj1, v_resj2, v_resj3; - v_expand(v_load(src + cn), v_src00, v_src01); - v_mul_expand(v_reinterpret_as_s16(v_src00), v_mul2, v_resj0, v_resj1); - v_mul_expand(v_reinterpret_as_s16(v_src01), v_mul2, v_resj2, v_resj3); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_res2 += v_resj2; - v_res3 += v_resj3; - - v_store((uint16_t*)dst, v_pack(v_reinterpret_as_u32(v_res0), v_reinterpret_as_u32(v_res1))); - v_store((uint16_t*)dst + 8, v_pack(v_reinterpret_as_u32(v_res2), v_reinterpret_as_u32(v_res3))); + v_expand(v_load(src + cn), v_src20, v_src21); + v_store((uint16_t*)dst, v_src00 * v_mul0 + v_src10 * v_mul1 + v_src20 * v_mul2); + v_store((uint16_t*)dst + 8, v_src01 * v_mul0 + v_src11 * v_mul1 + v_src21 * v_mul2); } for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-cn] + m[1] * src[0] + m[2] * src[cn]; @@ -1998,7 +1982,7 @@ void hlineSmooth3N121(const ET* src, int cn, const FT*, int, FT* dst, int len, i src += cn; dst += cn; for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) - *dst = ((FT(src[-cn]) + FT(src[cn]))>>2) + (FT(src[0])>>1); + *dst = (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[0])>>1); // Point that fall right from border for (int k = 0; k < cn; k++) @@ -2037,7 +2021,7 @@ void hlineSmooth3N121(const uint8_t* src, int cn, const src += cn; dst += cn; int i = cn, lencn = (len - 1)*cn; - for (; i < lencn - 15; i += 16, src += 16, dst += 16) + for (; i <= lencn - 16; i += 16, src += 16, dst += 16) { v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; v_expand(v_load(src - cn), v_src00, v_src01); @@ -2061,11 +2045,107 @@ void hlineSmooth3N121(const uint8_t* src, int cn, const } } template +void hlineSmooth3Naba(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) +{ + if (len == 1) + { + FT msum = borderType != BORDER_CONSTANT ? (m[0]<<1) + m[1] : m[1]; + for (int k = 0; k < cn; k++) + dst[k] = msum * src[k]; + } + else + { + // Point that fall left from border + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int src_idx = borderInterpolate(-1, len, borderType); + for (int k = 0; k < cn; k++) + dst[k] = m[1] * src[k] + m[0] * src[cn + k] + m[0] * src[src_idx*cn + k]; + } + else + { + for (int k = 0; k < cn; k++) + dst[k] = m[1] * src[k] + m[0] * src[cn + k]; + } + + src += cn; dst += cn; + for (int i = cn; i < (len - 1)*cn; i++, src++, dst++) + *dst = m[1] * src[0] + m[0] * src[-cn] + m[0] * src[cn]; + + // Point that fall right from border + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; + for (int k = 0; k < cn; k++) + dst[k] = m[1] * src[k] + m[0] * src[k - cn] + m[0] * src[src_idx + k]; + } + else + { + for (int k = 0; k < cn; k++) + dst[k] = m[0] * src[k - cn] + m[1] * src[k]; + } + } +} +template <> +void hlineSmooth3Naba(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) +{ + if (len == 1) + { + ufixedpoint16 msum = borderType != BORDER_CONSTANT ? 
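// Why the scalar 1-2-1 path now shifts each neighbour before adding: the horizontal
// pass accumulates in 16-bit fixed point (Q8), and FT(src[-cn]) + FT(src[cn]) can
// already exceed the 16-bit range for bright pixels, while
// (src[-cn] >> 2) + (src[cn] >> 2) + (src[0] >> 1) stays in range for any 8-bit input.
// The rewritten SIMD loops rely on a similar bound: for a normalized kernel, an 8-bit
// sample times a Q8 weight keeps the whole 3-tap sum below 2^16, so plain v_uint16x8
// multiplies and adds can replace the old 32-bit v_dotprod accumulation.  A minimal
// demonstration with raw Q8 values (illustrative helper, not OpenCV API):
#include <cstdint>

inline uint16_t smooth121_q8(uint8_t left, uint8_t center, uint8_t right)
{
    uint16_t l = (uint16_t)(left   << 8);   // promote the 8-bit samples to Q8 fixed point
    uint16_t c = (uint16_t)(center << 8);
    uint16_t r = (uint16_t)(right  << 8);
    // (l + r) >> 2 could overflow first: 0xFF00 + 0xFF00 does not fit in 16 bits.
    return (uint16_t)((l >> 2) + (r >> 2) + (c >> 1));   // at most 0xFF00, never overflows
}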
(m[0]<<1) + m[1] : m[1]; + for (int k = 0; k < cn; k++) + dst[k] = msum * src[k]; + } + else + { + // Point that fall left from border + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int src_idx = borderInterpolate(-1, len, borderType); + for (int k = 0; k < cn; k++) + ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[cn + k]) + (uint16_t)(src[src_idx*cn + k])); + } + else + { + for (int k = 0; k < cn; k++) + dst[k] = m[1] * src[k] + m[0] * src[cn + k]; + } + + src += cn; dst += cn; + int i = cn, lencn = (len - 1)*cn; + v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m)); + v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)m+1)); + for (; i <= lencn - 16; i += 16, src += 16, dst += 16) + { + v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_expand(v_load(src - cn), v_src00, v_src01); + v_expand(v_load(src), v_src10, v_src11); + v_expand(v_load(src + cn), v_src20, v_src21); + v_store((uint16_t*)dst, (v_src00 + v_src20) * v_mul0 + v_src10 * v_mul1); + v_store((uint16_t*)dst + 8, (v_src01 + v_src21) * v_mul0 + v_src11 * v_mul1); + } + for (; i < lencn; i++, src++, dst++) + *((uint16_t*)dst) = ((uint16_t*)m)[1] * src[0] + ((uint16_t*)m)[0] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])); + + // Point that fall right from border + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn; + for (int k = 0; k < cn; k++) + ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[src_idx + k])); + } + else + { + for (int k = 0; k < cn; k++) + dst[k] = m[0] * src[k - cn] + m[1] * src[k]; + } + } +} +template void hlineSmooth5N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) { if (len == 1) { - ufixedpoint16 msum = borderType != BORDER_CONSTANT ? m[0] + m[1] + m[2] + m[3] + m[4] : m[2]; + FT msum = borderType != BORDER_CONSTANT ? 
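// hlineSmooth3Naba handles the symmetric 3-tap kernel (a b a) and folds the two
// mirrored taps before multiplying, so each output costs two multiplies instead of
// three; the SIMD loop above does the same with (v_src00 + v_src20) * v_mul0.
// Scalar sketch with Q8 weights (illustrative helper, not OpenCV API):
#include <cstdint>

inline uint16_t smoothAba(const uint8_t* src, int cn, uint16_t a_q8, uint16_t b_q8)
{
    // a*src[-cn] + b*src[0] + a*src[+cn]  ==  a*(src[-cn] + src[+cn]) + b*src[0]
    uint16_t folded = (uint16_t)(src[-cn] + src[cn]);     // at most 510, fits easily
    return (uint16_t)(a_q8 * folded + b_q8 * src[0]);     // below 2^16 for a normalized kernel
}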
m[0] + m[1] + m[2] + m[3] + m[4] : m[2]; for (int k = 0; k < cn; k++) dst[k] = msum * src[k]; } @@ -2228,44 +2308,21 @@ void hlineSmooth5N(const uint8_t* src, int cn, const ufi src += 2 * cn; dst += 2 * cn; int i = 2*cn, lencn = (len - 2)*cn; - v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m))); - v_int16x8 v_mul23 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)(m + 2)))); - v_int16x8 v_mul4 = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + 4)))); - for (; i < lencn - 15; i += 16, src += 16, dst += 16) + v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m)); + v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)(m + 1))); + v_uint16x8 v_mul2 = v_setall_u16(*((uint16_t*)(m + 2))); + v_uint16x8 v_mul3 = v_setall_u16(*((uint16_t*)(m + 3))); + v_uint16x8 v_mul4 = v_setall_u16(*((uint16_t*)(m + 4))); + for (; i <= lencn - 16; i += 16, src += 16, dst += 16) { - v_uint16x8 v_src00, v_src01, v_src10, v_src11; - v_int16x8 v_tmp0, v_tmp1; - + v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21, v_src30, v_src31, v_src40, v_src41; v_expand(v_load(src - 2*cn), v_src00, v_src01); v_expand(v_load(src - cn), v_src10, v_src11); - v_zip(v_reinterpret_as_s16(v_src00), v_reinterpret_as_s16(v_src10), v_tmp0, v_tmp1); - v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul01); - v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul01); - v_zip(v_reinterpret_as_s16(v_src01), v_reinterpret_as_s16(v_src11), v_tmp0, v_tmp1); - v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul01); - v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul01); - - - v_expand(v_load(src), v_src00, v_src01); - v_expand(v_load(src + cn), v_src10, v_src11); - v_zip(v_reinterpret_as_s16(v_src00), v_reinterpret_as_s16(v_src10), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul23); - v_res1 += v_dotprod(v_tmp1, v_mul23); - v_zip(v_reinterpret_as_s16(v_src01), v_reinterpret_as_s16(v_src11), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul23); - v_res3 += v_dotprod(v_tmp1, v_mul23); - - v_int32x4 v_resj0, v_resj1, v_resj2, v_resj3; - v_expand(v_load(src + 2*cn), v_src00, v_src01); - v_mul_expand(v_reinterpret_as_s16(v_src00), v_mul4, v_resj0, v_resj1); - v_mul_expand(v_reinterpret_as_s16(v_src01), v_mul4, v_resj2, v_resj3); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_res2 += v_resj2; - v_res3 += v_resj3; - - v_store((uint16_t*)dst, v_pack(v_reinterpret_as_u32(v_res0), v_reinterpret_as_u32(v_res1))); - v_store((uint16_t*)dst + 8, v_pack(v_reinterpret_as_u32(v_res2), v_reinterpret_as_u32(v_res3))); + v_expand(v_load(src), v_src20, v_src21); + v_expand(v_load(src + cn), v_src30, v_src31); + v_expand(v_load(src + 2*cn), v_src40, v_src41); + v_store((uint16_t*)dst, v_src00 * v_mul0 + v_src10 * v_mul1 + v_src20 * v_mul2 + v_src30 * v_mul3 + v_src40 * v_mul4); + v_store((uint16_t*)dst + 8, v_src01 * v_mul0 + v_src11 * v_mul1 + v_src21 * v_mul2 + v_src31 * v_mul3 + v_src41 * v_mul4); } for (; i < lencn; i++, src++, dst++) *dst = m[0] * src[-2*cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2*cn]; @@ -2295,7 +2352,7 @@ void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, { if (borderType == BORDER_CONSTANT) for (int k = 0; k < cn; k++) - dst[k] = (FT(src[k])>>3)*3; + dst[k] = (FT(src[k])>>3)*(uint8_t)3; else for (int k = 0; k < cn; k++) dst[k] = src[k]; @@ -2305,8 +2362,8 @@ void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, if (borderType == BORDER_CONSTANT) for (int k = 0; k < cn; k++) { - dst[k] = (FT(src[k])>>4)*6 + (FT(src[k + cn])>>2); - dst[k + cn] = (FT(src[k]) >> 2) + 
(FT(src[k + cn])>>4)*6; + dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2); + dst[k + cn] = (FT(src[k]) >> 2) + (FT(src[k + cn])>>4)*(uint8_t)6; } else { @@ -2316,8 +2373,8 @@ void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, int idxp2 = borderInterpolate(3, len, borderType)*cn; for (int k = 0; k < cn; k++) { - dst[k] = (FT(src[k])>>4)*6 + (FT(src[k + idxm1])>>2) + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>4) + (FT(src[k + idxm2])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*6 + (FT(src[k])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp2])>>4); + dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + idxm1])>>2) + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>4) + (FT(src[k + idxm2])>>4); + dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp2])>>4); } } } @@ -2326,9 +2383,9 @@ void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, if (borderType == BORDER_CONSTANT) for (int k = 0; k < cn; k++) { - dst[k] = (FT(src[k])>>4)*6 + (FT(src[k + cn])>>2) + (FT(src[k + 2 * cn])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*6 + (FT(src[k])>>2) + (FT(src[k + 2 * cn])>>2); - dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*6 + (FT(src[k + cn])>>2) + (FT(src[k])>>4); + dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + 2 * cn])>>4); + dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + 2 * cn])>>2); + dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k])>>4); } else { @@ -2338,9 +2395,9 @@ void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, int idxp2 = borderInterpolate(4, len, borderType)*cn; for (int k = 0; k < cn; k++) { - dst[k] = (FT(src[k])>>4)*6 + (FT(src[k + cn])>>2) + (FT(src[k + idxm1])>>2) + (FT(src[k + 2 * cn])>>4) + (FT(src[k + idxm2])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*6 + (FT(src[k])>>2) + (FT(src[k + 2 * cn])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp1])>>4); - dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*6 + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k])>>4) + (FT(src[k + idxp2])>>4); + dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + idxm1])>>2) + (FT(src[k + 2 * cn])>>4) + (FT(src[k + idxm2])>>4); + dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k + 2 * cn])>>2) + (FT(src[k + idxm1])>>4) + (FT(src[k + idxp1])>>4); + dst[k + 2 * cn] = (FT(src[k + 2 * cn])>>4)*(uint8_t)6 + (FT(src[k + cn])>>2) + (FT(src[k + idxp1])>>2) + (FT(src[k])>>4) + (FT(src[k + idxp2])>>4); } } } @@ -2349,8 +2406,8 @@ void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, // Points that fall left from border for (int k = 0; k < cn; k++) { - dst[k] = (FT(src[k])>>4)*6 + (FT(src[cn + k])>>2) + (FT(src[2 * cn + k])>>4); - dst[k + cn] = (FT(src[cn + k])>>4)*6 + (FT(src[k])>>2) + (FT(src[2 * cn + k])>>2) + (FT(src[3 * cn + k])>>4); + dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[cn + k])>>2) + (FT(src[2 * cn + k])>>4); + dst[k + cn] = (FT(src[cn + k])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[2 * cn + k])>>2) + (FT(src[3 * cn + k])>>4); } if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped { @@ -2365,13 +2422,13 @@ void hlineSmooth5N14641(const ET* src, int cn, const FT*, int, FT* dst, int len, src += 2 * cn; dst += 2 * cn; for (int i = 2 * cn; i < (len - 2)*cn; i++, src++, dst++) - 
*dst = (FT(src[0])>>4)*6 + (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[-2 * cn])>>4) + (FT(src[2 * cn])>>4); + *dst = (FT(src[0])>>4)*(uint8_t)6 + (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[-2 * cn])>>4) + (FT(src[2 * cn])>>4); // Points that fall right from border for (int k = 0; k < cn; k++) { - dst[k] = (FT(src[k])>>4)*6 + (FT(src[k - cn])>>2) + (FT(src[k + cn])>>2) + (FT(src[k - 2 * cn])>>4); - dst[k + cn] = (FT(src[k + cn])>>4)*6 + (FT(src[k])>>2) + (FT(src[k - cn])>>4); + dst[k] = (FT(src[k])>>4)*(uint8_t)6 + (FT(src[k - cn])>>2) + (FT(src[k + cn])>>2) + (FT(src[k - 2 * cn])>>4); + dst[k + cn] = (FT(src[k + cn])>>4)*(uint8_t)6 + (FT(src[k])>>2) + (FT(src[k - cn])>>4); } if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped { @@ -2392,7 +2449,7 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons { if (borderType == BORDER_CONSTANT) for (int k = 0; k < cn; k++) - dst[k] = (ufixedpoint16(src[k])>>3) * 3; + dst[k] = (ufixedpoint16(src[k])>>3) * (uint8_t)3; else { for (int k = 0; k < cn; k++) @@ -2404,8 +2461,8 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons if (borderType == BORDER_CONSTANT) for (int k = 0; k < cn; k++) { - dst[k] = (ufixedpoint16(src[k]) >> 4) * 6 + (ufixedpoint16(src[k + cn]) >> 2); - dst[k + cn] = (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + cn]) >> 4) * 6; + dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2); + dst[k + cn] = (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6; } else { @@ -2415,8 +2472,8 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons int idxp2 = borderInterpolate(3, len, borderType)*cn; for (int k = 0; k < cn; k++) { - dst[k] = (ufixedpoint16(src[k]) >> 4) * 6 + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * 6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); + dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); + dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); } } } @@ -2425,9 +2482,9 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons if (borderType == BORDER_CONSTANT) for (int k = 0; k < cn; k++) { - dst[k] = (ufixedpoint16(src[k]) >> 4) * 6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * 6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2); - dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * 6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k]) >> 4); + dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4); + dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2); + dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k]) >> 4); } else { @@ -2437,9 +2494,9 @@ void 
hlineSmooth5N14641(const uint8_t* src, int cn, cons int idxp2 = borderInterpolate(4, len, borderType)*cn; for (int k = 0; k < cn; k++) { - dst[k] = (ufixedpoint16(src[k]) >> 4) * 6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * 6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp1]) >> 4); - dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * 6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); + dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 4) + (ufixedpoint16(src[k + idxm2]) >> 4); + dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k + 2 * cn]) >> 2) + (ufixedpoint16(src[k + idxm1]) >> 4) + (ufixedpoint16(src[k + idxp1]) >> 4); + dst[k + 2 * cn] = (ufixedpoint16(src[k + 2 * cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k + idxp1]) >> 2) + (ufixedpoint16(src[k]) >> 4) + (ufixedpoint16(src[k + idxp2]) >> 4); } } } @@ -2448,8 +2505,8 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons // Points that fall left from border for (int k = 0; k < cn; k++) { - dst[k] = (ufixedpoint16(src[k]) >> 4) * 6 + (ufixedpoint16(src[cn + k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 4); - dst[k + cn] = (ufixedpoint16(src[cn + k]) >> 4) * 6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 2) + (ufixedpoint16(src[3 * cn + k]) >> 4); + dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[cn + k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 4); + dst[k + cn] = (ufixedpoint16(src[cn + k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[2 * cn + k]) >> 2) + (ufixedpoint16(src[3 * cn + k]) >> 4); } if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped { @@ -2465,7 +2522,7 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons src += 2 * cn; dst += 2 * cn; int i = 2 * cn, lencn = (len - 2)*cn; v_uint16x8 v_6 = v_setall_u16(6); - for (; i < lencn - 15; i += 16, src += 16, dst += 16) + for (; i <= lencn - 16; i += 16, src += 16, dst += 16) { v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21, v_src30, v_src31, v_src40, v_src41; v_expand(v_load(src - 2*cn), v_src00, v_src01); @@ -2482,8 +2539,8 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons // Points that fall right from border for (int k = 0; k < cn; k++) { - dst[k] = (ufixedpoint16(src[k]) >> 4) * 6 + (ufixedpoint16(src[k - cn]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k - 2 * cn]) >> 4); - dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * 6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k - cn]) >> 4); + dst[k] = (ufixedpoint16(src[k]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k - cn]) >> 2) + (ufixedpoint16(src[k + cn]) >> 2) + (ufixedpoint16(src[k - 2 * cn]) >> 4); + dst[k + cn] = (ufixedpoint16(src[k + cn]) >> 4) * (uint8_t)6 + (ufixedpoint16(src[k]) >> 2) + (ufixedpoint16(src[k - cn]) >> 4); } if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped { @@ -2498,6 
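// The repeated *(uint8_t)6 and *(uint8_t)3 casts above appear to make the small
// integer factor go through ufixedpoint16's plain-integer multiply overload instead
// of being converted into a second fixed-point operand, which would widen the result
// type (or leave a bare int literal without an exact overload).  The toy type below
// only illustrates that distinction; it is not OpenCV's ufixedpoint16.
#include <cstdint>

struct FixQ8                       // toy Q8 fixed-point wrapper, illustration only
{
    uint16_t raw;                  // stored as value * 256
    explicit FixQ8(uint8_t v) : raw((uint16_t)(v << 8)) {}
    FixQ8 operator*(uint8_t n) const                // multiply by an integer count: stays Q8
    { FixQ8 r((uint8_t)0); r.raw = (uint16_t)(raw * n); return r; }
    uint32_t operator*(const FixQ8& o) const        // multiply by another Q8 value: result is Q16
    { return (uint32_t)raw * o.raw; }
};

// fq * (uint8_t)6 therefore reads as "six times this Q8 sample", keeping the cheap
// 16-bit representation, rather than a Q8 * Q8 product in a wider type.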
+2555,215 @@ void hlineSmooth5N14641(const uint8_t* src, int cn, cons } } template +void hlineSmooth5Nabcba(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType) +{ + if (len == 1) + { + FT msum = borderType != BORDER_CONSTANT ? ((m[0] + m[1])<<1) + m[2] : m[2]; + for (int k = 0; k < cn; k++) + dst[k] = msum * src[k]; + } + else if (len == 2) + { + if (borderType == BORDER_CONSTANT) + for (int k = 0; k < cn; k++) + { + dst[k] = m[2] * src[k] + m[1] * src[k + cn]; + dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; + } + else + { + int idxm2 = borderInterpolate(-2, len, borderType)*cn; + int idxm1 = borderInterpolate(-1, len, borderType)*cn; + int idxp1 = borderInterpolate(2, len, borderType)*cn; + int idxp2 = borderInterpolate(3, len, borderType)*cn; + for (int k = 0; k < cn; k++) + { + dst[k] = m[1] * src[k + idxm1] + m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + idxp1] + m[0] * src[k + idxm2]; + dst[k + cn] = m[0] * src[k + idxm1] + m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + idxp1] + m[0] * src[k + idxp2]; + } + } + } + else if (len == 3) + { + if (borderType == BORDER_CONSTANT) + for (int k = 0; k < cn; k++) + { + dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn]; + dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + 2 * cn]; + dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; + } + else + { + int idxm2 = borderInterpolate(-2, len, borderType)*cn; + int idxm1 = borderInterpolate(-1, len, borderType)*cn; + int idxp1 = borderInterpolate(3, len, borderType)*cn; + int idxp2 = borderInterpolate(4, len, borderType)*cn; + for (int k = 0; k < cn; k++) + { + dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn] + m[0] * src[k + idxm2] + m[1] * src[k + idxm1]; + dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn] + m[1] * src[k + 2 * cn] + m[0] * src[k + idxm1] + m[0] * src[k + idxp1]; + dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn] + m[1] * src[k + idxp1] + m[0] * src[k + idxp2]; + } + } + } + else + { + // Points that fall left from border + for (int k = 0; k < cn; k++) + { + dst[k] = m[2] * src[k] + m[1] * src[cn + k] + m[0] * src[2 * cn + k]; + dst[k + cn] = m[1] * src[k] + m[2] * src[cn + k] + m[1] * src[2 * cn + k] + m[0] * src[3 * cn + k]; + } + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int idxm2 = borderInterpolate(-2, len, borderType)*cn; + int idxm1 = borderInterpolate(-1, len, borderType)*cn; + for (int k = 0; k < cn; k++) + { + dst[k] = dst[k] + m[0] * src[idxm2 + k] + m[1] * src[idxm1 + k]; + dst[k + cn] = dst[k + cn] + m[0] * src[idxm1 + k]; + } + } + + src += 2 * cn; dst += 2 * cn; + for (int i = 2 * cn; i < (len - 2)*cn; i++, src++, dst++) + *dst = m[0] * src[-2 * cn] + m[1] * src[-cn] + m[2] * src[0] + m[3] * src[cn] + m[4] * src[2 * cn]; + + // Points that fall right from border + for (int k = 0; k < cn; k++) + { + dst[k] = m[0] * src[k - 2 * cn] + m[1] * src[k - cn] + m[2] * src[k] + m[3] * src[k + cn]; + dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; + } + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; + int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; + for (int k = 0; k < cn; k++) + { + dst[k] = dst[k] + m[0] * src[idxp1 + k]; + dst[k + cn] = dst[k + cn] + m[1] * 
src[idxp1 + k] + m[0] * src[idxp2 + k]; + } + } + } +} +template <> +void hlineSmooth5Nabcba(const uint8_t* src, int cn, const ufixedpoint16* m, int, ufixedpoint16* dst, int len, int borderType) +{ + if (len == 1) + { + ufixedpoint16 msum = borderType != BORDER_CONSTANT ? ((m[0] + m[1]) << 1) + m[2] : m[2]; + for (int k = 0; k < cn; k++) + dst[k] = msum * src[k]; + } + else if (len == 2) + { + if (borderType == BORDER_CONSTANT) + for (int k = 0; k < cn; k++) + { + dst[k] = m[2] * src[k] + m[1] * src[k + cn]; + dst[k + cn] = m[1] * src[k] + m[2] * src[k + cn]; + } + else + { + int idxm2 = borderInterpolate(-2, len, borderType)*cn; + int idxm1 = borderInterpolate(-1, len, borderType)*cn; + int idxp1 = borderInterpolate(2, len, borderType)*cn; + int idxp2 = borderInterpolate(3, len, borderType)*cn; + for (int k = 0; k < cn; k++) + { + ((uint16_t*)dst)[k] = ((uint16_t*)m)[1] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxp1]) + (uint16_t)(src[k + idxm2])); + ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + idxp2])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + idxp1])) + ((uint16_t*)m)[2] * src[k + cn]; + } + } + } + else if (len == 3) + { + if (borderType == BORDER_CONSTANT) + for (int k = 0; k < cn; k++) + { + dst[k] = m[2] * src[k] + m[1] * src[k + cn] + m[0] * src[k + 2 * cn]; + ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + 2 * cn])) + ((uint16_t*)m)[2] * src[k + cn]; + dst[k + 2 * cn] = m[0] * src[k] + m[1] * src[k + cn] + m[2] * src[k + 2 * cn]; + } + else + { + int idxm2 = borderInterpolate(-2, len, borderType)*cn; + int idxm1 = borderInterpolate(-1, len, borderType)*cn; + int idxp1 = borderInterpolate(3, len, borderType)*cn; + int idxp2 = borderInterpolate(4, len, borderType)*cn; + for (int k = 0; k < cn; k++) + { + ((uint16_t*)dst)[k] = ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[1] * ((uint16_t)(src[k + cn]) + (uint16_t)(src[k + idxm1])) + ((uint16_t*)m)[0] * ((uint16_t)(src[k + 2 * cn]) + (uint16_t)(src[k + idxm2])); + ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[2] * src[k + cn] + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[k + 2 * cn])) + ((uint16_t*)m)[0] * ((uint16_t)(src[k + idxm1]) + (uint16_t)(src[k + idxp1])); + ((uint16_t*)dst)[k + 2 * cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k]) + (uint16_t)(src[k + idxp2])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k + cn]) + (uint16_t)(src[k + idxp1])) + ((uint16_t*)m)[2] * src[k + 2 * cn]; + } + } + } + else + { + // Points that fall left from border + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int idxm2 = borderInterpolate(-2, len, borderType)*cn; + int idxm1 = borderInterpolate(-1, len, borderType)*cn; + for (int k = 0; k < cn; k++) + { + ((uint16_t*)dst)[k] = ((uint16_t*)m)[2] * src[k] + ((uint16_t*)m)[1] * ((uint16_t)(src[cn + k]) + (uint16_t)(src[idxm1 + k])) + ((uint16_t*)m)[0] * ((uint16_t)(src[2 * cn + k]) + (uint16_t)(src[idxm2 + k])); + ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[2 * cn + k])) + ((uint16_t*)m)[2] * src[cn + k] + ((uint16_t*)m)[0] * ((uint16_t)(src[3 * cn + k]) + (uint16_t)(src[idxm1 + k])); + } + } + else + { + for (int k = 0; k < cn; k++) + { + dst[k] = m[2] * src[k] + m[1] * src[cn + k] + m[0] * src[2 * cn + k]; + ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[2 
* cn + k])) + ((uint16_t*)m)[2] * src[cn + k] + ((uint16_t*)m)[0] * src[3 * cn + k]; + } + } + + src += 2 * cn; dst += 2 * cn; + int i = 2 * cn, lencn = (len - 2)*cn; + v_uint16x8 v_mul0 = v_setall_u16(*((uint16_t*)m)); + v_uint16x8 v_mul1 = v_setall_u16(*((uint16_t*)(m + 1))); + v_uint16x8 v_mul2 = v_setall_u16(*((uint16_t*)(m + 2))); + for (; i <= lencn - 16; i += 16, src += 16, dst += 16) + { + v_uint16x8 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21, v_src30, v_src31, v_src40, v_src41; + v_expand(v_load(src - 2 * cn), v_src00, v_src01); + v_expand(v_load(src - cn), v_src10, v_src11); + v_expand(v_load(src), v_src20, v_src21); + v_expand(v_load(src + cn), v_src30, v_src31); + v_expand(v_load(src + 2 * cn), v_src40, v_src41); + v_store((uint16_t*)dst, (v_src00 + v_src40) * v_mul0 + (v_src10 + v_src30)* v_mul1 + v_src20 * v_mul2); + v_store((uint16_t*)dst + 8, (v_src01 + v_src41) * v_mul0 + (v_src11 + v_src31) * v_mul1 + v_src21 * v_mul2); + } + for (; i < lencn; i++, src++, dst++) + *((uint16_t*)dst) = ((uint16_t*)m)[0] * ((uint16_t)(src[-2 * cn]) + (uint16_t)(src[2 * cn])) + ((uint16_t*)m)[1] * ((uint16_t)(src[-cn]) + (uint16_t)(src[cn])) + ((uint16_t*)m)[2] * src[0]; + + // Points that fall right from border + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + { + int idxp1 = (borderInterpolate(len, len, borderType) - (len - 2))*cn; + int idxp2 = (borderInterpolate(len + 1, len, borderType) - (len - 2))*cn; + for (int k = 0; k < cn; k++) + { + ((uint16_t*)dst)[k] = ((uint16_t*)m)[0] * ((uint16_t)(src[k - 2 * cn]) + (uint16_t)(src[idxp1 + k])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k]; + ((uint16_t*)dst)[k + cn] = ((uint16_t*)m)[0] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[idxp2 + k])) + ((uint16_t*)m)[1] * ((uint16_t)(src[k]) + (uint16_t)(src[idxp1 + k])) + ((uint16_t*)m)[2] * src[k + cn]; + } + } + else + { + for (int k = 0; k < cn; k++) + { + ((uint16_t*)dst)[k] = ((uint16_t*)m)[0] * src[k - 2 * cn] + ((uint16_t*)m)[1] * ((uint16_t)(src[k - cn]) + (uint16_t)(src[k + cn])) + ((uint16_t*)m)[2] * src[k]; + dst[k + cn] = m[0] * src[k - cn] + m[1] * src[k] + m[2] * src[k + cn]; + } + } + } +} +template void hlineSmooth(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType) { int pre_shift = n / 2; @@ -2582,57 +2848,157 @@ void hlineSmooth(const uint8_t* src, int cn, const ufixe } i *= cn; int lencn = (len - post_shift + 1)*cn; - for (; i < lencn - 15; i+=16, src+=16, dst+=16) + for (; i <= lencn - 16; i+=16, src+=16, dst+=16) { - v_uint16x8 v_src00, v_src01, v_src10, v_src11; - v_int16x8 v_tmp0, v_tmp1; - - v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m))); - - v_expand(v_load(src), v_src00, v_src01); - v_expand(v_load(src+cn), v_src10, v_src11); - v_zip(v_reinterpret_as_s16(v_src00), v_reinterpret_as_s16(v_src10), v_tmp0, v_tmp1); - v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul); - v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul); - v_zip(v_reinterpret_as_s16(v_src01), v_reinterpret_as_s16(v_src11), v_tmp0, v_tmp1); - v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul); - v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul); - - int j = 2; - for (; j < n - 1; j += 2) + v_uint16x8 v_src0, v_src1; + v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)m)); + v_expand(v_load(src), v_src0, v_src1); + v_uint16x8 v_res0 = v_src0 * v_mul; + v_uint16x8 v_res1 = v_src1 * v_mul; + for (int j = 1; j < n; j++) { - v_mul = 
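// hlineSmooth5Nabcba above exploits the symmetric 5-tap kernel (a b c b a): the
// mirrored neighbours are added first, so each output needs three multiplies instead
// of five, exactly what the SIMD loop does with (v_src00 + v_src40) * v_mul0 + ... .
// Scalar sketch with Q8 weights (illustrative helper, not OpenCV API):
#include <cstdint>

inline uint16_t smoothAbcba(const uint8_t* s, int cn,
                            uint16_t a_q8, uint16_t b_q8, uint16_t c_q8)
{
    uint16_t outer = (uint16_t)(s[-2 * cn] + s[2 * cn]);   // at most 510
    uint16_t inner = (uint16_t)(s[-cn] + s[cn]);           // at most 510
    // for a normalized kernel the whole sum stays below 2^16, so 16-bit lanes suffice
    return (uint16_t)(a_q8 * outer + b_q8 * inner + c_q8 * s[0]);
}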
v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)(m + j)))); - - v_expand(v_load(src + j * cn), v_src00, v_src01); - v_expand(v_load(src + (j + 1) * cn), v_src10, v_src11); - v_zip(v_reinterpret_as_s16(v_src00), v_reinterpret_as_s16(v_src10), v_tmp0, v_tmp1); - v_res0 += v_dotprod(v_tmp0, v_mul); - v_res1 += v_dotprod(v_tmp1, v_mul); - v_zip(v_reinterpret_as_s16(v_src01), v_reinterpret_as_s16(v_src11), v_tmp0, v_tmp1); - v_res2 += v_dotprod(v_tmp0, v_mul); - v_res3 += v_dotprod(v_tmp1, v_mul); + v_mul = v_setall_u16(*((uint16_t*)(m + j))); + v_expand(v_load(src + j * cn), v_src0, v_src1); + v_res0 += v_src0 * v_mul; + v_res1 += v_src1 * v_mul; } - if (j < n) + v_store((uint16_t*)dst, v_res0); + v_store((uint16_t*)dst+8, v_res1); + } + for (; i < lencn; i++, src++, dst++) + { + *dst = m[0] * src[0]; + for (int j = 1; j < n; j++) + *dst = *dst + m[j] * src[j*cn]; + } + i /= cn; + for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border + { + for (int k = 0; k < cn; k++) + dst[k] = m[0] * src[k]; + int j = 1; + for (; j < len - i; j++) + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[j] * src[j*cn + k]; + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + for (; j < n; j++) + { + int src_idx = borderInterpolate(i + j, len, borderType) - i; + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; + } + } +} +template +void hlineSmoothONa_yzy_a(const ET* src, int cn, const FT* m, int n, FT* dst, int len, int borderType) +{ + int pre_shift = n / 2; + int post_shift = n - pre_shift; + int i = 0; + for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border + { + for (int k = 0; k < cn; k++) + dst[k] = m[pre_shift - i] * src[k]; + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) + { + int src_idx = borderInterpolate(j, len, borderType); + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; + } + int j, mid; + for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[j*cn + k]; + if (borderType != BORDER_CONSTANT) + for (; j < i + post_shift; j++, mid++) + { + int src_idx = borderInterpolate(j, len, borderType); + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; + } + } + i *= cn; + for (; i < (len - post_shift + 1)*cn; i++, src++, dst++) + { + *dst = m[pre_shift] * src[pre_shift*cn]; + for (int j = 0; j < pre_shift; j++) + *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n-1-j)*cn]; + } + i /= cn; + for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border + { + for (int k = 0; k < cn; k++) + dst[k] = m[0] * src[k]; + int j = 1; + for (; j < len - i; j++) + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[j] * src[j*cn + k]; + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + for (; j < n; j++) + { + int src_idx = borderInterpolate(i + j, len, borderType) - i; + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[j] * src[src_idx*cn + k]; + } + } +} +template <> +void hlineSmoothONa_yzy_a(const uint8_t* src, int cn, const ufixedpoint16* m, int n, ufixedpoint16* dst, int len, int borderType) +{ + int pre_shift = n / 2; + int post_shift = n - pre_shift; + int i = 
0; + for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border + { + for (int k = 0; k < cn; k++) + dst[k] = m[pre_shift - i] * src[k]; + if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped + for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++) + { + int src_idx = borderInterpolate(j, len, borderType); + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; + } + int j, mid; + for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++) + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[j*cn + k]; + if (borderType != BORDER_CONSTANT) + for (; j < i + post_shift; j++, mid++) + { + int src_idx = borderInterpolate(j, len, borderType); + for (int k = 0; k < cn; k++) + dst[k] = dst[k] + m[mid] * src[src_idx*cn + k]; + } + } + i *= cn; + int lencn = (len - post_shift + 1)*cn; + for (; i <= lencn - 16; i += 16, src += 16, dst += 16) + { + v_uint16x8 v_src00, v_src01, v_srcN00, v_srcN01; + + v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)(m + pre_shift))); + v_expand(v_load(src + pre_shift * cn), v_src00, v_src01); + v_uint16x8 v_res0 = v_src00 * v_mul; + v_uint16x8 v_res1 = v_src01 * v_mul; + for (int j = 0; j < pre_shift; j ++) { - v_int32x4 v_resj0, v_resj1, v_resj2, v_resj3; - v_mul = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + j)))); + v_mul = v_setall_u16(*((uint16_t*)(m + j))); v_expand(v_load(src + j * cn), v_src00, v_src01); - v_mul_expand(v_reinterpret_as_s16(v_src00), v_mul, v_resj0, v_resj1); - v_mul_expand(v_reinterpret_as_s16(v_src01), v_mul, v_resj2, v_resj3); - v_res0 += v_resj0; - v_res1 += v_resj1; - v_res2 += v_resj2; - v_res3 += v_resj3; + v_expand(v_load(src + (n - 1 - j)*cn), v_srcN00, v_srcN01); + v_res0 += (v_src00 + v_srcN00) * v_mul; + v_res1 += (v_src01 + v_srcN01) * v_mul; } - v_store((uint16_t*)dst, v_pack(v_reinterpret_as_u32(v_res0), v_reinterpret_as_u32(v_res1))); - v_store((uint16_t*)dst+8, v_pack(v_reinterpret_as_u32(v_res2), v_reinterpret_as_u32(v_res3))); + v_store((uint16_t*)dst, v_res0); + v_store((uint16_t*)dst + 8, v_res1); } for (; i < lencn; i++, src++, dst++) { - *dst = m[0] * src[0]; - for (int j = 1; j < n; j++) - *dst = *dst + m[j] * src[j*cn]; + *dst = m[pre_shift] * src[pre_shift*cn]; + for (int j = 0; j < pre_shift; j++) + *dst = *dst + m[j] * src[j*cn] + m[j] * src[(n - 1 - j)*cn]; } i /= cn; for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border @@ -2657,20 +3023,33 @@ void vlineSmooth1N(const FT* const * src, const FT* m, int, ET* dst, int len) { const FT* src0 = src[0]; for (int i = 0; i < len; i++) - dst[i] = m * src0[i]; + dst[i] = *m * src0[i]; } template <> void vlineSmooth1N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) { const ufixedpoint16* src0 = src[0]; v_uint16x8 v_mul = v_setall_u16(*((uint16_t*)m)); +#if CV_SSE2 + v_uint16x8 v_1 = v_setall_u16(1); + v_mul += v_mul; +#endif int i = 0; - for (; i < len - 7; i += 8) + for (; i <= len - 16; i += 16) { v_uint16x8 v_src0 = v_load((uint16_t*)src0 + i); - v_uint32x4 v_res0, v_res1; + v_uint16x8 v_src1 = v_load((uint16_t*)src0 + i + 8); + v_uint8x16 v_res; +#if CV_SSE2 + v_res.val = _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(v_1.val, _mm_mulhi_epu16(v_src0.val, v_mul.val)),1), + _mm_srli_epi16(_mm_add_epi16(v_1.val, _mm_mulhi_epu16(v_src1.val, v_mul.val)),1)); +#else + v_uint32x4 v_res0, v_res1, v_res2, v_res3; v_mul_expand(v_src0, v_mul, v_res0, 
v_res1); - v_pack_store(dst + i, v_rshr_pack<16>(v_res0, v_res1)); + v_mul_expand(v_src1, v_mul, v_res2, v_res3); + v_res = v_pack(v_rshr_pack<16>(v_res0, v_res1), v_rshr_pack<16>(v_res2, v_res3)); +#endif + v_store(dst + i, v_res); } for (; i < len; i++) dst[i] = m[0] * src0[i]; @@ -2687,7 +3066,7 @@ void vlineSmooth1N1(const ufixedpoint16* const * src, co { const ufixedpoint16* src0 = src[0]; int i = 0; - for (; i < len - 7; i += 8) + for (; i <= len - 8; i += 8) v_rshr_pack_store<8>(dst + i, v_load((uint16_t*)(src0 + i))); for (; i < len; i++) dst[i] = src0[i]; @@ -2701,40 +3080,73 @@ void vlineSmooth3N(const FT* const * src, const FT* m, int, ET* dst, int len) template <> void vlineSmooth3N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) { + int i = 0; static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15)); - v_int32x4 v_128_4 = v_setall_s32(128 << 16); if (len > 7) { ufixedpoint32 val[] = { (m[0] + m[1] + m[2]) * ufixedpoint16((uint8_t)128) }; v_128_4 = v_setall_s32(*((int32_t*)val)); } - - int i = 0; v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m))); v_int16x8 v_mul2 = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + 2)))); - for (; i < len - 7; i += 8) + for (; i <= len - 32; i += 32) { - v_int16x8 v_src0, v_src1; + v_int16x8 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; v_int16x8 v_tmp0, v_tmp1; - v_src0 = v_load((int16_t*)(src[0]) + i); - v_src1 = v_load((int16_t*)(src[1]) + i); - v_zip(v_add_wrap(v_src0, v_128), v_add_wrap(v_src1, v_128), v_tmp0, v_tmp1); + v_src00 = v_load((int16_t*)(src[0]) + i); + v_src01 = v_load((int16_t*)(src[0]) + i + 8); + v_src02 = v_load((int16_t*)(src[0]) + i + 16); + v_src03 = v_load((int16_t*)(src[0]) + i + 24); + v_src10 = v_load((int16_t*)(src[1]) + i); + v_src11 = v_load((int16_t*)(src[1]) + i + 8); + v_src12 = v_load((int16_t*)(src[1]) + i + 16); + v_src13 = v_load((int16_t*)(src[1]) + i + 24); + v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul01); v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul01); + v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul01); + v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul01); + v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res4 = v_dotprod(v_tmp0, v_mul01); + v_int32x4 v_res5 = v_dotprod(v_tmp1, v_mul01); + v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res6 = v_dotprod(v_tmp0, v_mul01); + v_int32x4 v_res7 = v_dotprod(v_tmp1, v_mul01); v_int32x4 v_resj0, v_resj1; - v_src0 = v_load((int16_t*)(src[2]) + i); - v_mul_expand(v_add_wrap(v_src0, v_128), v_mul2, v_resj0, v_resj1); + v_src00 = v_load((int16_t*)(src[2]) + i); + v_src01 = v_load((int16_t*)(src[2]) + i + 8); + v_src02 = v_load((int16_t*)(src[2]) + i + 16); + v_src03 = v_load((int16_t*)(src[2]) + i + 24); + v_mul_expand(v_add_wrap(v_src00, v_128), v_mul2, v_resj0, v_resj1); v_res0 += v_resj0; v_res1 += v_resj1; + v_mul_expand(v_add_wrap(v_src01, v_128), v_mul2, v_resj0, v_resj1); + v_res2 += v_resj0; + v_res3 += v_resj1; + v_mul_expand(v_add_wrap(v_src02, v_128), v_mul2, v_resj0, v_resj1); + v_res4 += v_resj0; + v_res5 += v_resj1; + v_mul_expand(v_add_wrap(v_src03, v_128), v_mul2, v_resj0, v_resj1); + v_res6 += v_resj0; + v_res7 += v_resj1; v_res0 += v_128_4; v_res1 += v_128_4; - - v_uint16x8 v_res = 
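// The CV_SSE2 branch of vlineSmooth1N above replaces the widening multiply plus
// rounding shift with a 16-bit-only sequence: with the weight pre-doubled,
// _mm_mulhi_epu16(x, 2*m) equals (x*m) >> 15, and adding 1 then shifting right by 1
// reproduces (x*m + 2^15) >> 16, i.e. the same rounded Q16-to-integer conversion that
// v_rshr_pack<16> performs, without ever leaving 16-bit lanes.  Scalar check of the
// identity (illustrative helper, not OpenCV API):
#include <cstdint>
#include <cassert>

inline uint8_t roundedMulQ16(uint16_t x, uint16_t m)    // requires m < 0x8000 so 2*m fits in 16 bits
{
    uint32_t exact = ((uint32_t)x * m + 0x8000u) >> 16;                     // rounding shift by 16
    uint16_t hi    = (uint16_t)(((uint32_t)x * (uint16_t)(2u * m)) >> 16);  // what _mm_mulhi_epu16 returns
    uint32_t trick = ((uint32_t)hi + 1u) >> 1;                              // add 1, shift right by 1
    assert(trick == exact);                       // the two agree for every x and every m < 0x8000
    return (uint8_t)(trick > 255 ? 255 : trick);  // the real code saturates via _mm_packus_epi16
}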
v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)); - v_pack_store(dst + i, v_res); + v_res2 += v_128_4; + v_res3 += v_128_4; + v_res4 += v_128_4; + v_res5 += v_128_4; + v_res6 += v_128_4; + v_res7 += v_128_4; + + v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); + v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); } for (; i < len; i++) dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i]; @@ -2743,20 +3155,23 @@ template void vlineSmooth3N121(const FT* const * src, const FT*, int, ET* dst, int len) { for (int i = 0; i < len; i++) - dst[i] = ((FT::WT(src[0][i]) + FT::WT(src[2][i])) >> 2) + (FT::WT(src[1][i]) >> 1); + dst[i] = (FT::WT(src[0][i]) >> 2) + (FT::WT(src[2][i]) >> 2) + (FT::WT(src[1][i]) >> 1); } template <> void vlineSmooth3N121(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) { int i = 0; - for (; i < len - 7; i += 8) + for (; i <= len - 16; i += 16) { - v_uint32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21; + v_uint32x4 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23; v_expand(v_load((uint16_t*)(src[0]) + i), v_src00, v_src01); + v_expand(v_load((uint16_t*)(src[0]) + i + 8), v_src02, v_src03); v_expand(v_load((uint16_t*)(src[1]) + i), v_src10, v_src11); + v_expand(v_load((uint16_t*)(src[1]) + i + 8), v_src12, v_src13); v_expand(v_load((uint16_t*)(src[2]) + i), v_src20, v_src21); - v_uint16x8 v_res = v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 << 1), v_src01 + v_src21 + (v_src11 << 1)); - v_pack_store(dst + i, v_res); + v_expand(v_load((uint16_t*)(src[2]) + i + 8), v_src22, v_src23); + v_store(dst + i, v_pack(v_rshr_pack<10>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)), + v_rshr_pack<10>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13)))); } for (; i < len; i++) dst[i] = (((uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[2]))[i]) + ((uint32_t)(((uint16_t*)(src[1]))[i]) << 1)) + (1 << 9)) >> 10; @@ -2770,47 +3185,95 @@ void vlineSmooth5N(const FT* const * src, const FT* m, int, ET* dst, int len) template <> void vlineSmooth5N(const ufixedpoint16* const * src, const ufixedpoint16* m, int, uint8_t* dst, int len) { + int i = 0; static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15)); - v_int32x4 v_128_4 = v_setall_s32(128 << 16); if (len > 7) { ufixedpoint32 val[] = { (m[0] + m[1] + m[2] + m[3] + m[4]) * ufixedpoint16((uint8_t)128) }; v_128_4 = v_setall_s32(*((int32_t*)val)); } - - int i = 0; v_int16x8 v_mul01 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m))); v_int16x8 v_mul23 = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)(m + 2)))); v_int16x8 v_mul4 = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + 4)))); - for (; i < len - 7; i += 8) + for (; i <= len - 32; i += 32) { - v_int16x8 v_src0, v_src1; + v_int16x8 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; v_int16x8 v_tmp0, v_tmp1; - v_src0 = v_load((int16_t*)(src[0]) + i); - v_src1 = v_load((int16_t*)(src[1]) + i); - v_zip(v_add_wrap(v_src0, v_128), v_add_wrap(v_src1, v_128), v_tmp0, v_tmp1); + v_src00 = v_load((int16_t*)(src[0]) + i); + v_src01 = v_load((int16_t*)(src[0]) + i + 8); + v_src02 = v_load((int16_t*)(src[0]) + i + 16); + v_src03 = v_load((int16_t*)(src[0]) + i + 24); + 
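// In the vertical 1-2-1 pass the rows are already ufixedpoint16 (Q8) values, so the
// kernel's divide-by-4 and the Q8-to-integer conversion fold into a single rounding
// shift by 10 bits, which is what v_rshr_pack<10> does above.  Scalar form
// (illustrative helper, matching the scalar tail of vlineSmooth3N121):
#include <cstdint>

inline uint8_t smooth121Vertical(uint16_t r0_q8, uint16_t r1_q8, uint16_t r2_q8)
{
    uint32_t sum = (uint32_t)r0_q8 + r2_q8 + 2u * r1_q8;  // up to 4 * 0xFFFF, needs 32 bits
    uint32_t v   = (sum + (1u << 9)) >> 10;               // (sum / 4) in Q8, rounded to nearest
    return (uint8_t)(v > 255 ? 255 : v);                  // the vector code saturates when packing
}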
v_src10 = v_load((int16_t*)(src[1]) + i); + v_src11 = v_load((int16_t*)(src[1]) + i + 8); + v_src12 = v_load((int16_t*)(src[1]) + i + 16); + v_src13 = v_load((int16_t*)(src[1]) + i + 24); + v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul01); v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul01); - - v_src0 = v_load((int16_t*)(src[2]) + i); - v_src1 = v_load((int16_t*)(src[3]) + i); - v_zip(v_add_wrap(v_src0, v_128), v_add_wrap(v_src1, v_128), v_tmp0, v_tmp1); + v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul01); + v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul01); + v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res4 = v_dotprod(v_tmp0, v_mul01); + v_int32x4 v_res5 = v_dotprod(v_tmp1, v_mul01); + v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res6 = v_dotprod(v_tmp0, v_mul01); + v_int32x4 v_res7 = v_dotprod(v_tmp1, v_mul01); + + v_src00 = v_load((int16_t*)(src[2]) + i); + v_src01 = v_load((int16_t*)(src[2]) + i + 8); + v_src02 = v_load((int16_t*)(src[2]) + i + 16); + v_src03 = v_load((int16_t*)(src[2]) + i + 24); + v_src10 = v_load((int16_t*)(src[3]) + i); + v_src11 = v_load((int16_t*)(src[3]) + i + 8); + v_src12 = v_load((int16_t*)(src[3]) + i + 16); + v_src13 = v_load((int16_t*)(src[3]) + i + 24); + v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); v_res0 += v_dotprod(v_tmp0, v_mul23); v_res1 += v_dotprod(v_tmp1, v_mul23); + v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); + v_res2 += v_dotprod(v_tmp0, v_mul23); + v_res3 += v_dotprod(v_tmp1, v_mul23); + v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); + v_res4 += v_dotprod(v_tmp0, v_mul23); + v_res5 += v_dotprod(v_tmp1, v_mul23); + v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); + v_res6 += v_dotprod(v_tmp0, v_mul23); + v_res7 += v_dotprod(v_tmp1, v_mul23); v_int32x4 v_resj0, v_resj1; - v_src0 = v_load((int16_t*)(src[4]) + i); - v_mul_expand(v_add_wrap(v_src0, v_128), v_mul4, v_resj0, v_resj1); + v_src00 = v_load((int16_t*)(src[4]) + i); + v_src01 = v_load((int16_t*)(src[4]) + i + 8); + v_src02 = v_load((int16_t*)(src[4]) + i + 16); + v_src03 = v_load((int16_t*)(src[4]) + i + 24); + v_mul_expand(v_add_wrap(v_src00, v_128), v_mul4, v_resj0, v_resj1); v_res0 += v_resj0; v_res1 += v_resj1; + v_mul_expand(v_add_wrap(v_src01, v_128), v_mul4, v_resj0, v_resj1); + v_res2 += v_resj0; + v_res3 += v_resj1; + v_mul_expand(v_add_wrap(v_src02, v_128), v_mul4, v_resj0, v_resj1); + v_res4 += v_resj0; + v_res5 += v_resj1; + v_mul_expand(v_add_wrap(v_src03, v_128), v_mul4, v_resj0, v_resj1); + v_res6 += v_resj0; + v_res7 += v_resj1; v_res0 += v_128_4; v_res1 += v_128_4; - - v_uint16x8 v_res = v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)); - v_pack_store(dst + i, v_res); + v_res2 += v_128_4; + v_res3 += v_128_4; + v_res4 += v_128_4; + v_res5 += v_128_4; + v_res6 += v_128_4; + v_res7 += v_128_4; + + v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); + v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); } for (; i < len; i++) dst[i] = m[0] * src[0][i] + m[1] * src[1][i] + m[2] * src[2][i] + m[3] * src[3][i] + m[4] * src[4][i]; @@ 
-2819,24 +3282,33 @@ template void vlineSmooth5N14641(const FT* const * src, const FT*, int, ET* dst, int len) { for (int i = 0; i < len; i++) - dst[i] = (FT::WT(src[2][i])*6 + ((FT::WT(src[1][i]) + FT::WT(src[3][i]))<<2) + FT::WT(src[0][i]) + FT::WT(src[4][i])) >> 4; + dst[i] = (FT::WT(src[2][i])*(uint8_t)6 + ((FT::WT(src[1][i]) + FT::WT(src[3][i]))<<2) + FT::WT(src[0][i]) + FT::WT(src[4][i])) >> 4; } template <> void vlineSmooth5N14641(const ufixedpoint16* const * src, const ufixedpoint16*, int, uint8_t* dst, int len) { int i = 0; v_uint32x4 v_6 = v_setall_u32(6); - for (; i < len - 7; i += 8) + for (; i <= len - 16; i += 16) { - v_uint32x4 v_src00, v_src01, v_src10, v_src11, v_src20, v_src21, v_src30, v_src31, v_src40, v_src41; + v_uint32x4 v_src00, v_src10, v_src20, v_src30, v_src40; + v_uint32x4 v_src01, v_src11, v_src21, v_src31, v_src41; + v_uint32x4 v_src02, v_src12, v_src22, v_src32, v_src42; + v_uint32x4 v_src03, v_src13, v_src23, v_src33, v_src43; v_expand(v_load((uint16_t*)(src[0]) + i), v_src00, v_src01); + v_expand(v_load((uint16_t*)(src[0]) + i + 8), v_src02, v_src03); v_expand(v_load((uint16_t*)(src[1]) + i), v_src10, v_src11); + v_expand(v_load((uint16_t*)(src[1]) + i + 8), v_src12, v_src13); v_expand(v_load((uint16_t*)(src[2]) + i), v_src20, v_src21); + v_expand(v_load((uint16_t*)(src[2]) + i + 8), v_src22, v_src23); v_expand(v_load((uint16_t*)(src[3]) + i), v_src30, v_src31); + v_expand(v_load((uint16_t*)(src[3]) + i + 8), v_src32, v_src33); v_expand(v_load((uint16_t*)(src[4]) + i), v_src40, v_src41); - v_uint16x8 v_res = v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, - v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41); - v_pack_store(dst + i, v_res); + v_expand(v_load((uint16_t*)(src[4]) + i + 8), v_src42, v_src43); + v_store(dst + i, v_pack(v_rshr_pack<12>(v_src20*v_6 + ((v_src10 + v_src30) << 2) + v_src00 + v_src40, + v_src21*v_6 + ((v_src11 + v_src31) << 2) + v_src01 + v_src41), + v_rshr_pack<12>(v_src22*v_6 + ((v_src12 + v_src32) << 2) + v_src02 + v_src42, + v_src23*v_6 + ((v_src13 + v_src33) << 2) + v_src03 + v_src43))); } for (; i < len; i++) dst[i] = ((uint32_t)(((uint16_t*)(src[2]))[i]) * 6 + @@ -2857,8 +3329,8 @@ void vlineSmooth(const FT* const * src, const FT* m, int n, ET* dst, int len) template <> void vlineSmooth(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) { + int i = 0; static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15)); - v_int32x4 v_128_4 = v_setall_s32(128 << 16); if (len > 7) { @@ -2868,46 +3340,188 @@ void vlineSmooth(const ufixedpoint16* const * src, const ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) }; v_128_4 = v_setall_s32(*((int32_t*)val)); } - - int i = 0; - for (; i < len - 7; i += 8) + for (; i <= len - 32; i += 32) { - v_int16x8 v_src0, v_src1; + v_int16x8 v_src00, v_src10, v_src01, v_src11, v_src02, v_src12, v_src03, v_src13; v_int16x8 v_tmp0, v_tmp1; v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)m))); - v_src0 = v_load((int16_t*)(src[0]) + i); - v_src1 = v_load((int16_t*)(src[1]) + i); - v_zip(v_add_wrap(v_src0, v_128), v_add_wrap(v_src1, v_128), v_tmp0, v_tmp1); + v_src00 = v_load((int16_t*)(src[0]) + i); + v_src01 = v_load((int16_t*)(src[0]) + i + 8); + v_src02 = v_load((int16_t*)(src[0]) + i + 16); + v_src03 = v_load((int16_t*)(src[0]) + i + 24); + v_src10 = v_load((int16_t*)(src[1]) + i); + v_src11 = v_load((int16_t*)(src[1]) + i + 8); + v_src12 = v_load((int16_t*)(src[1]) + i + 16); 
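// The vlineSmooth* dot-product loops above feed unsigned Q8 rows into the signed
// v_dotprod by first adding 0x8000 with wraparound, turning the u16 value x into the
// s16 value x - 32768.  Every product is therefore offset by 32768 * m, and the
// per-kernel constant v_128_4 (built from sum(m) * ufixedpoint16(128), i.e. 32768
// times the summed raw Q8 weights) is added back before the rounding shift.  Scalar
// form of the identity for one pair of taps (illustrative helper):
#include <cstdint>

inline uint32_t biasedDotprodPair(uint16_t x0, uint16_t x1, int16_t m0, int16_t m1)
{
    int16_t b0 = (int16_t)(x0 + 0x8000);            // same bit pattern as v_add_wrap(x, 1 << 15),
    int16_t b1 = (int16_t)(x1 + 0x8000);            // two's-complement wrap assumed
    int32_t biased       = (int32_t)b0 * m0 + (int32_t)b1 * m1;   // what v_dotprod accumulates
    int32_t compensation = 32768 * ((int32_t)m0 + m1);            // the v_128_4 term
    return (uint32_t)(biased + compensation);       // equals x0*m0 + x1*m1, the unsigned dot product
}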
+ v_src13 = v_load((int16_t*)(src[1]) + i + 24); + v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); v_int32x4 v_res0 = v_dotprod(v_tmp0, v_mul); v_int32x4 v_res1 = v_dotprod(v_tmp1, v_mul); + v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res2 = v_dotprod(v_tmp0, v_mul); + v_int32x4 v_res3 = v_dotprod(v_tmp1, v_mul); + v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res4 = v_dotprod(v_tmp0, v_mul); + v_int32x4 v_res5 = v_dotprod(v_tmp1, v_mul); + v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); + v_int32x4 v_res6 = v_dotprod(v_tmp0, v_mul); + v_int32x4 v_res7 = v_dotprod(v_tmp1, v_mul); int j = 2; for (; j < n - 1; j+=2) { v_mul = v_reinterpret_as_s16(v_setall_u32(*((uint32_t*)(m+j)))); - v_src0 = v_load((int16_t*)(src[j]) + i); - v_src1 = v_load((int16_t*)(src[j+1]) + i); - v_zip(v_add_wrap(v_src0, v_128), v_add_wrap(v_src1, v_128), v_tmp0, v_tmp1); + v_src00 = v_load((int16_t*)(src[j]) + i); + v_src01 = v_load((int16_t*)(src[j]) + i + 8); + v_src02 = v_load((int16_t*)(src[j]) + i + 16); + v_src03 = v_load((int16_t*)(src[j]) + i + 24); + v_src10 = v_load((int16_t*)(src[j+1]) + i); + v_src11 = v_load((int16_t*)(src[j+1]) + i + 8); + v_src12 = v_load((int16_t*)(src[j+1]) + i + 16); + v_src13 = v_load((int16_t*)(src[j+1]) + i + 24); + v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src10, v_128), v_tmp0, v_tmp1); v_res0 += v_dotprod(v_tmp0, v_mul); v_res1 += v_dotprod(v_tmp1, v_mul); + v_zip(v_add_wrap(v_src01, v_128), v_add_wrap(v_src11, v_128), v_tmp0, v_tmp1); + v_res2 += v_dotprod(v_tmp0, v_mul); + v_res3 += v_dotprod(v_tmp1, v_mul); + v_zip(v_add_wrap(v_src02, v_128), v_add_wrap(v_src12, v_128), v_tmp0, v_tmp1); + v_res4 += v_dotprod(v_tmp0, v_mul); + v_res5 += v_dotprod(v_tmp1, v_mul); + v_zip(v_add_wrap(v_src03, v_128), v_add_wrap(v_src13, v_128), v_tmp0, v_tmp1); + v_res6 += v_dotprod(v_tmp0, v_mul); + v_res7 += v_dotprod(v_tmp1, v_mul); } if(j < n) { v_int32x4 v_resj0, v_resj1; v_mul = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + j)))); - v_src0 = v_load((int16_t*)(src[j]) + i); - v_mul_expand(v_add_wrap(v_src0, v_128), v_mul, v_resj0, v_resj1); + v_src00 = v_load((int16_t*)(src[j]) + i); + v_src01 = v_load((int16_t*)(src[j]) + i + 8); + v_src02 = v_load((int16_t*)(src[j]) + i + 16); + v_src03 = v_load((int16_t*)(src[j]) + i + 24); + v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_resj0, v_resj1); v_res0 += v_resj0; v_res1 += v_resj1; + v_mul_expand(v_add_wrap(v_src01, v_128), v_mul, v_resj0, v_resj1); + v_res2 += v_resj0; + v_res3 += v_resj1; + v_mul_expand(v_add_wrap(v_src02, v_128), v_mul, v_resj0, v_resj1); + v_res4 += v_resj0; + v_res5 += v_resj1; + v_mul_expand(v_add_wrap(v_src03, v_128), v_mul, v_resj0, v_resj1); + v_res6 += v_resj0; + v_res7 += v_resj1; } v_res0 += v_128_4; v_res1 += v_128_4; + v_res2 += v_128_4; + v_res3 += v_128_4; + v_res4 += v_128_4; + v_res5 += v_128_4; + v_res6 += v_128_4; + v_res7 += v_128_4; + + v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); + v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); + } + for (; i < len; i++) + { + ufixedpoint32 val = m[0] * src[0][i]; + for (int j = 1; j < n; j++) + { + val = val + m[j] * src[j][i]; + } + dst[i] = val; + } +} +template +void vlineSmoothONa_yzy_a(const FT* const * src, 
const FT* m, int n, ET* dst, int len) +{ + int pre_shift = n / 2; + for (int i = 0; i < len; i++) + { + typename FT::WT val = m[pre_shift] * src[pre_shift][i]; + for (int j = 0; j < pre_shift; j++) + val = val + m[j] * src[j][i] + m[j] * src[(n - 1 - j)][i]; + dst[i] = val; + } +} +template <> +void vlineSmoothONa_yzy_a(const ufixedpoint16* const * src, const ufixedpoint16* m, int n, uint8_t* dst, int len) +{ + int pre_shift = n / 2; + int i = 0; + static const v_int16x8 v_128 = v_reinterpret_as_s16(v_setall_u16((uint16_t)1 << 15)); + v_int32x4 v_128_4 = v_setall_s32(128 << 16); + if (len > 7) + { + ufixedpoint16 msum = m[0] + m[pre_shift] + m[n - 1]; + for (int j = 1; j < pre_shift; j++) + msum = msum + m[j] + m[n - 1 - j]; + ufixedpoint32 val[] = { msum * ufixedpoint16((uint8_t)128) }; + v_128_4 = v_setall_s32(*((int32_t*)val)); + } + for (; i <= len - 32; i += 32) + { + v_int16x8 v_src00, v_src10, v_src20, v_src30, v_src01, v_src11, v_src21, v_src31; + v_int32x4 v_res0, v_res1, v_res2, v_res3, v_res4, v_res5, v_res6, v_res7; + v_int16x8 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7; + + v_int16x8 v_mul = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + pre_shift)))); + v_src00 = v_load((int16_t*)(src[pre_shift]) + i); + v_src10 = v_load((int16_t*)(src[pre_shift]) + i + 8); + v_src20 = v_load((int16_t*)(src[pre_shift]) + i + 16); + v_src30 = v_load((int16_t*)(src[pre_shift]) + i + 24); + v_mul_expand(v_add_wrap(v_src00, v_128), v_mul, v_res0, v_res1); + v_mul_expand(v_add_wrap(v_src10, v_128), v_mul, v_res2, v_res3); + v_mul_expand(v_add_wrap(v_src20, v_128), v_mul, v_res4, v_res5); + v_mul_expand(v_add_wrap(v_src30, v_128), v_mul, v_res6, v_res7); + + int j = 0; + for (; j < pre_shift; j++) + { + v_mul = v_reinterpret_as_s16(v_setall_u16(*((uint16_t*)(m + j)))); + + v_src00 = v_load((int16_t*)(src[j]) + i); + v_src10 = v_load((int16_t*)(src[j]) + i + 8); + v_src20 = v_load((int16_t*)(src[j]) + i + 16); + v_src30 = v_load((int16_t*)(src[j]) + i + 24); + v_src01 = v_load((int16_t*)(src[n - 1 - j]) + i); + v_src11 = v_load((int16_t*)(src[n - 1 - j]) + i + 8); + v_src21 = v_load((int16_t*)(src[n - 1 - j]) + i + 16); + v_src31 = v_load((int16_t*)(src[n - 1 - j]) + i + 24); + v_zip(v_add_wrap(v_src00, v_128), v_add_wrap(v_src01, v_128), v_tmp0, v_tmp1); + v_res0 += v_dotprod(v_tmp0, v_mul); + v_res1 += v_dotprod(v_tmp1, v_mul); + v_zip(v_add_wrap(v_src10, v_128), v_add_wrap(v_src11, v_128), v_tmp2, v_tmp3); + v_res2 += v_dotprod(v_tmp2, v_mul); + v_res3 += v_dotprod(v_tmp3, v_mul); + v_zip(v_add_wrap(v_src20, v_128), v_add_wrap(v_src21, v_128), v_tmp4, v_tmp5); + v_res4 += v_dotprod(v_tmp4, v_mul); + v_res5 += v_dotprod(v_tmp5, v_mul); + v_zip(v_add_wrap(v_src30, v_128), v_add_wrap(v_src31, v_128), v_tmp6, v_tmp7); + v_res6 += v_dotprod(v_tmp6, v_mul); + v_res7 += v_dotprod(v_tmp7, v_mul); + } - v_uint16x8 v_res = v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)); - v_pack_store(dst + i, v_res); + v_res0 += v_128_4; + v_res1 += v_128_4; + v_res2 += v_128_4; + v_res3 += v_128_4; + v_res4 += v_128_4; + v_res5 += v_128_4; + v_res6 += v_128_4; + v_res7 += v_128_4; + + v_store(dst + i , v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res0, v_res1)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res2, v_res3)))); + v_store(dst + i + 16, v_pack(v_reinterpret_as_u16(v_rshr_pack<16>(v_res4, v_res5)), + v_reinterpret_as_u16(v_rshr_pack<16>(v_res6, v_res7)))); } for (; i < len; i++) { @@ -2930,52 +3544,76 @@ public: { if (kxlen == 1) { - if ((kx[0] - FT::one()).isZero()) + if (kx[0] == 
FT::one()) hlineSmoothFunc = hlineSmooth1N1; else hlineSmoothFunc = hlineSmooth1N; } else if (kxlen == 3) { - if ((kx[0] - (FT::one()>>2)).isZero()&&(kx[1] - (FT::one()>>1)).isZero()&&(kx[2] - (FT::one()>>2)).isZero()) + if (kx[0] == (FT::one()>>2)&&kx[1] == (FT::one()>>1)&&kx[2] == (FT::one()>>2)) hlineSmoothFunc = hlineSmooth3N121; + else if ((kx[0] - kx[2]).isZero()) + hlineSmoothFunc = hlineSmooth3Naba; else hlineSmoothFunc = hlineSmooth3N; } else if (kxlen == 5) { - if ((kx[2] - (FT::one()*3>>3)).isZero()&& - (kx[1] - (FT::one()>>2)).isZero()&&(kx[3] - (FT::one()>>2)).isZero()&& - (kx[0] - (FT::one()>>4)).isZero()&&(kx[4] - (FT::one()>>4)).isZero()) + if (kx[2] == (FT::one()*(uint8_t)3>>3) && + kx[1] == (FT::one()>>2) && kx[3] == (FT::one()>>2) && + kx[0] == (FT::one()>>4) && kx[4] == (FT::one()>>4)) hlineSmoothFunc = hlineSmooth5N14641; + else if (kx[0] == kx[4] && kx[1] == kx[3]) + hlineSmoothFunc = hlineSmooth5Nabcba; else hlineSmoothFunc = hlineSmooth5N; } + else if (kxlen % 2 == 1) + { + hlineSmoothFunc = hlineSmoothONa_yzy_a; + for (int i = 0; i < kxlen / 2; i++) + if (!(kx[i] == kx[kxlen - 1 - i])) + { + hlineSmoothFunc = hlineSmooth; + break; + } + } else hlineSmoothFunc = hlineSmooth; if (kylen == 1) { - if ((ky[0] - FT::one()).isZero()) + if (ky[0] == FT::one()) vlineSmoothFunc = vlineSmooth1N1; else vlineSmoothFunc = vlineSmooth1N; } else if (kylen == 3) { - if ((ky[0] - (FT::one() >> 2)).isZero() && (ky[1] - (FT::one() >> 1)).isZero() && (ky[2] - (FT::one() >> 2)).isZero()) + if (ky[0] == (FT::one() >> 2) && ky[1] == (FT::one() >> 1) && ky[2] == (FT::one() >> 2)) vlineSmoothFunc = vlineSmooth3N121; else vlineSmoothFunc = vlineSmooth3N; } else if (kylen == 5) { - if ((ky[2] - (FT::one() * 3 >> 3)).isZero() && - (ky[1] - (FT::one() >> 2)).isZero() && (ky[3] - (FT::one() >> 2)).isZero() && - (ky[0] - (FT::one() >> 4)).isZero() && (ky[4] - (FT::one() >> 4)).isZero()) + if (ky[2] == (FT::one() * (uint8_t)3 >> 3) && + ky[1] == (FT::one() >> 2) && ky[3] == (FT::one() >> 2) && + ky[0] == (FT::one() >> 4) && ky[4] == (FT::one() >> 4)) vlineSmoothFunc = vlineSmooth5N14641; else vlineSmoothFunc = vlineSmooth5N; } + else if (kylen % 2 == 1) + { + vlineSmoothFunc = vlineSmoothONa_yzy_a; + for (int i = 0; i < kylen / 2; i++) + if (!(ky[i] == ky[kylen - 1 - i])) + { + vlineSmoothFunc = vlineSmooth; + break; + } + } else vlineSmoothFunc = vlineSmooth; } @@ -3520,7 +4158,7 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, if (src.data == dst.data) src = src.clone(); fixedSmoothInvoker invoker(src.ptr(), src.step1(), dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED); - parallel_for_(Range(0, dst.rows), invoker, dst.total() * cn / (double)(1 << 13)); + parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs()))); return; } diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp index cb3fee5c11..31869f1d75 100644 --- a/modules/imgproc/test/test_imgwarp_strict.cpp +++ b/modules/imgproc/test/test_imgwarp_strict.cpp @@ -77,6 +77,7 @@ protected: virtual void run_func() = 0; virtual void run_reference_func() = 0; + virtual float get_success_error_level(int _interpolation, int _depth) const; virtual void validate_results() const; virtual void prepare_test_data_for_reference_func(); @@ -229,6 +230,20 @@ void CV_ImageWarpBaseTest::run(int) ts->set_gtest_status(); } +float 
CV_ImageWarpBaseTest::get_success_error_level(int _interpolation, int) const +{ + if (_interpolation == INTER_CUBIC) + return 1.0f; + else if (_interpolation == INTER_LANCZOS4) + return 1.0f; + else if (_interpolation == INTER_NEAREST) + return 1.0f; + else if (_interpolation == INTER_AREA) + return 2.0f; + else + return 1.0f; +} + void CV_ImageWarpBaseTest::validate_results() const { Mat _dst; @@ -237,15 +252,7 @@ void CV_ImageWarpBaseTest::validate_results() const Size dsize = dst.size(), ssize = src.size(); int cn = _dst.channels(); dsize.width *= cn; - float t = 1.0f; - if (interpolation == INTER_CUBIC) - t = 1.0f; - else if (interpolation == INTER_LANCZOS4) - t = 1.0f; - else if (interpolation == INTER_NEAREST) - t = 1.0f; - else if (interpolation == INTER_AREA) - t = 2.0f; + float t = get_success_error_level(interpolation & INTER_MAX, dst.depth()); for (int dy = 0; dy < dsize.height; ++dy) { @@ -1034,7 +1041,7 @@ public: protected: virtual void generate_test_data(); - virtual void prepare_test_data_for_reference_func(); + virtual float get_success_error_level(int _interpolation, int _depth) const; virtual void run_func(); virtual void run_reference_func(); @@ -1083,16 +1090,16 @@ void CV_WarpAffine_Test::run_func() cv::warpAffine(src, dst, M, dst.size(), interpolation, borderType, borderValue); } -void CV_WarpAffine_Test::prepare_test_data_for_reference_func() +float CV_WarpAffine_Test::get_success_error_level(int _interpolation, int _depth) const { - CV_ImageWarpBaseTest::prepare_test_data_for_reference_func(); + return _depth == CV_8U ? 0 : CV_ImageWarpBaseTest::get_success_error_level(_interpolation, _depth); } void CV_WarpAffine_Test::run_reference_func() { - prepare_test_data_for_reference_func(); - - warpAffine(src, reference_dst); + Mat tmp = Mat::zeros(dst.size(), dst.type()); + warpAffine(src, tmp); + tmp.convertTo(reference_dst, reference_dst.depth()); } void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst) @@ -1123,7 +1130,7 @@ void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst) const int AB_SCALE = 1 << AB_BITS; int round_delta = (inter == INTER_NEAREST) ? 
AB_SCALE / 2 : (AB_SCALE / INTER_TAB_SIZE / 2); - const double* data_tM = tM.ptr(0); + const softdouble* data_tM = tM.ptr(0); for (int dy = 0; dy < dsize.height; ++dy) { short* yM = mapx.ptr(dy); @@ -1162,6 +1169,7 @@ public: protected: virtual void generate_test_data(); + virtual float get_success_error_level(int _interpolation, int _depth) const; virtual void run_func(); virtual void run_reference_func(); @@ -1204,11 +1212,16 @@ void CV_WarpPerspective_Test::run_func() cv::warpPerspective(src, dst, M, dst.size(), interpolation, borderType, borderValue); } -void CV_WarpPerspective_Test::run_reference_func() +float CV_WarpPerspective_Test::get_success_error_level(int _interpolation, int _depth) const { - prepare_test_data_for_reference_func(); + return CV_ImageWarpBaseTest::get_success_error_level(_interpolation, _depth); +} - warpPerspective(src, reference_dst); +void CV_WarpPerspective_Test::run_reference_func() +{ + Mat tmp = Mat::zeros(dst.size(), dst.type()); + warpPerspective(src, tmp); + tmp.convertTo(reference_dst, reference_dst.depth()); } void CV_WarpPerspective_Test::warpPerspective(const Mat& _src, Mat& _dst) diff --git a/samples/dnn/face_detector/opencv_face_detector.pbtxt b/samples/dnn/face_detector/opencv_face_detector.pbtxt index 78ba0bd4a4..e537e006bc 100644 --- a/samples/dnn/face_detector/opencv_face_detector.pbtxt +++ b/samples/dnn/face_detector/opencv_face_detector.pbtxt @@ -482,6 +482,7 @@ node { name: "conv4_3_norm/l2_normalize" op: "L2Normalize" input: "Relu_4:0" + input: "conv4_3_norm/l2_normalize/Sum/reduction_indices" } node { name: "conv4_3_norm/mul_1"
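
The hunk above adds the reduction_indices constant as an explicit input of the L2Normalize node in the face detector graph text. Below is a minimal sketch, not part of the patch, of how such a .pbtxt is typically paired with its frozen .pb weights through the cv::dnn TensorFlow importer; the weight file name, test image path, input size and mean values are assumptions for illustration only.

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    // Hypothetical file names: the frozen weights that accompany the .pbtxt
    // are not named anywhere in this diff.
    cv::dnn::Net net = cv::dnn::readNetFromTensorflow("opencv_face_detector_uint8.pb",
                                                      "opencv_face_detector.pbtxt");
    cv::Mat img = cv::imread("face.jpg");
    if (net.empty() || img.empty())
    {
        std::cerr << "could not load the network or the test image" << std::endl;
        return 1;
    }
    // 300x300 BGR blob with mean subtraction, as commonly used for this SSD-style
    // detector (values assumed, not taken from the diff).
    cv::Mat blob = cv::dnn::blobFromImage(img, 1.0, cv::Size(300, 300),
                                          cv::Scalar(104, 177, 123),
                                          /*swapRB=*/false, /*crop=*/false);
    net.setInput(blob);
    cv::Mat detections = net.forward();
    std::cout << "forward pass produced " << detections.total() << " values" << std::endl;
    return 0;
}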