Merge remote-tracking branch 'upstream/3.4' into merge-3.4

Branch: pull/11316/head
Author: Alexander Alekhin
Commit: 4b2d1aaeea
45 changed files:

  1. CMakeLists.txt (2 changes)
  2. cmake/templates/cvconfig.h.in (10 changes)
  3. doc/tutorials/gpu/gpu-basics-similarity/gpu_basics_similarity.markdown (4 changes)
  4. modules/core/include/opencv2/core/base.hpp (1 change)
  5. modules/core/include/opencv2/core/check.hpp (135 changes)
  6. modules/core/include/opencv2/core/hal/intrin_cpp.hpp (25 changes)
  7. modules/core/include/opencv2/core/hal/intrin_neon.hpp (25 changes)
  8. modules/core/include/opencv2/core/hal/intrin_sse.hpp (35 changes)
  9. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (131 changes)
  10. modules/core/include/opencv2/core/vsx_utils.hpp (120 changes)
  11. modules/core/src/check.cpp (160 changes)
  12. modules/core/src/ocl.cpp (50 changes)
  13. modules/core/src/system.cpp (30 changes)
  14. modules/core/test/test_intrin_utils.hpp (14 changes)
  15. modules/core/test/test_misc.cpp (499 changes)
  16. modules/dnn/include/opencv2/dnn/all_layers.hpp (2 changes)
  17. modules/dnn/include/opencv2/dnn/dnn.hpp (3 changes)
  18. modules/dnn/misc/quantize_face_detector.py (1 change)
  19. modules/dnn/perf/perf_net.cpp (35 changes)
  20. modules/dnn/src/caffe/opencv-caffe.proto (6 changes)
  21. modules/dnn/src/dnn.cpp (52 changes)
  22. modules/dnn/src/layers/batch_norm_layer.cpp (18 changes)
  23. modules/dnn/src/layers/blank_layer.cpp (20 changes)
  24. modules/dnn/src/layers/convolution_layer.cpp (38 changes)
  25. modules/dnn/src/layers/fully_connected_layer.cpp (4 changes)
  26. modules/dnn/src/layers/layers_common.cpp (15 changes)
  27. modules/dnn/src/layers/normalize_bbox_layer.cpp (85 changes)
  28. modules/dnn/src/layers/scale_layer.cpp (19 changes)
  29. modules/dnn/src/layers/shift_layer.cpp (21 changes)
  30. modules/dnn/src/op_inf_engine.cpp (133 changes)
  31. modules/dnn/src/op_inf_engine.hpp (19 changes)
  32. modules/dnn/src/tensorflow/tf_graph_simplifier.cpp (24 changes)
  33. modules/dnn/src/tensorflow/tf_importer.cpp (61 changes)
  34. modules/dnn/test/test_backends.cpp (50 changes)
  35. modules/dnn/test/test_layers.cpp (38 changes)
  36. modules/dnn/test/test_precomp.hpp (2 changes)
  37. modules/dnn/test/test_tf_importer.cpp (7 changes)
  38. modules/imgproc/perf/opencl/perf_imgwarp.cpp (2 changes)
  39. modules/imgproc/perf/perf_warp.cpp (6 changes)
  40. modules/imgproc/src/deriv.cpp (2 changes)
  41. modules/imgproc/src/fixedpoint.inl.hpp (145 changes)
  42. modules/imgproc/src/imgwarp.cpp (1045 changes)
  43. modules/imgproc/src/smooth.cpp (1060 changes)
  44. modules/imgproc/test/test_imgwarp_strict.cpp (51 changes)
  45. samples/dnn/face_detector/opencv_face_detector.pbtxt (1 change)

@@ -316,7 +316,7 @@ OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add
OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OFF IF CV_GCC )
OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CV_GCC )
OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CV_GCC AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
-OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF (CV_GCC AND PPC64LE) )
+OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF ((CV_GCC OR CV_CLANG) AND PPC64LE) )
OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CV_GCC AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) )
OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF (CV_GCC OR CV_CLANG) AND (ARM OR AARCH64 OR IOS) )

@@ -46,13 +46,13 @@
/* Cocoa API */
#cmakedefine HAVE_COCOA
-/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
+/* NVIDIA CUDA Basic Linear Algebra Subprograms (BLAS) API*/
#cmakedefine HAVE_CUBLAS
-/* NVidia Cuda Runtime API*/
+/* NVIDIA CUDA Runtime API*/
#cmakedefine HAVE_CUDA
-/* NVidia Cuda Fast Fourier Transform (FFT) API*/
+/* NVIDIA CUDA Fast Fourier Transform (FFT) API*/
#cmakedefine HAVE_CUFFT
/* IEEE1394 capturing support */
@@ -124,10 +124,10 @@
/* Microsoft Media Foundation Capture library */
#cmakedefine HAVE_MSMF
-/* NVidia Video Decoding API*/
+/* NVIDIA Video Decoding API*/
#cmakedefine HAVE_NVCUVID
-/* NVidia Video Encoding API*/
+/* NVIDIA Video Encoding API*/
#cmakedefine HAVE_NVCUVENC
/* OpenCL Support */

@@ -8,7 +8,7 @@ Goal
In the @ref tutorial_video_input_psnr_ssim tutorial I already presented the PSNR and SSIM methods for checking
the similarity between the two images. And as you could see, the execution process takes quite some
time, especially in the case of the SSIM. However, if the performance numbers of an OpenCV
-implementation for the CPU do not satisfy you and you happen to have an NVidia CUDA GPU device in
+implementation for the CPU do not satisfy you and you happen to have an NVIDIA CUDA GPU device in
your system, all is not lost. You may try to port or write your own algorithm for the video card.
This tutorial will give a good grasp on how to approach coding by using the GPU module of OpenCV. As
@@ -187,7 +187,7 @@ introduce asynchronous OpenCV GPU calls too with the help of the @ref cv::cuda::
Result and conclusion
---------------------
-On an Intel P8700 laptop CPU paired with a low end NVidia GT220M, here are the performance numbers:
+On an Intel P8700 laptop CPU paired with a low end NVIDIA GT220M, here are the performance numbers:
@code
Time of PSNR CPU (averaged for 10 runs): 41.4122 milliseconds. With result of: 19.2506
Time of PSNR GPU (averaged for 10 runs): 158.977 milliseconds. With result of: 19.2506

@@ -754,5 +754,6 @@ CV_EXPORTS_W void setUseIPP_NE(bool flag);
#include "opencv2/core/neon_utils.hpp"
#include "opencv2/core/vsx_utils.hpp"
+#include "opencv2/core/check.hpp"
#endif //OPENCV_CORE_BASE_HPP

@@ -0,0 +1,135 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_CHECK_HPP
#define OPENCV_CORE_CHECK_HPP
#include <opencv2/core/base.hpp>
namespace cv {
/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or "<invalid depth>" */
CV_EXPORTS const char* depthToString(int depth);
/** Returns string of cv::Mat type value: CV_8UC3 -> "CV_8UC3" or "<invalid type>" */
CV_EXPORTS const String typeToString(int type);
//! @cond IGNORED
namespace detail {
/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or NULL */
CV_EXPORTS const char* depthToString_(int depth);
/** Returns string of cv::Mat type value: CV_8UC3 -> "CV_8UC3" or cv::String() */
CV_EXPORTS const cv::String typeToString_(int type);
enum TestOp {
TEST_CUSTOM = 0,
TEST_EQ = 1,
TEST_NE = 2,
TEST_LE = 3,
TEST_LT = 4,
TEST_GE = 5,
TEST_GT = 6,
CV__LAST_TEST_OP
};
struct CheckContext {
const char* func;
const char* file;
int line;
enum TestOp testOp;
const char* message;
const char* p1_str;
const char* p2_str;
};
#ifndef CV__CHECK_FILENAME
# define CV__CHECK_FILENAME __FILE__
#endif
#ifndef CV__CHECK_FUNCTION
# if defined _MSC_VER
# define CV__CHECK_FUNCTION __FUNCSIG__
# elif defined __GNUC__
# define CV__CHECK_FUNCTION __PRETTY_FUNCTION__
# else
# define CV__CHECK_FUNCTION "<unknown>"
# endif
#endif
#define CV__CHECK_LOCATION_VARNAME(id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_check_, id), __LINE__)
#define CV__DEFINE_CHECK_CONTEXT(id, message, testOp, p1_str, p2_str) \
static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \
{ CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, message, p1_str, p2_str }
CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v1, const double v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);
#define CV__TEST_EQ(v1, v2) ((v1) == (v2))
#define CV__TEST_NE(v1, v2) ((v1) != (v2))
#define CV__TEST_LE(v1, v2) ((v1) <= (v2))
#define CV__TEST_LT(v1, v2) ((v1) < (v2))
#define CV__TEST_GE(v1, v2) ((v1) >= (v2))
#define CV__TEST_GT(v1, v2) ((v1) > (v2))
#define CV__CHECK(id, op, type, v1, v2, v1_str, v2_str, msg_str) do { \
if(CV__TEST_##op((v1), (v2))) ; else { \
CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_ ## op, v1_str, v2_str); \
cv::detail::check_failed_ ## type((v1), (v2), CV__CHECK_LOCATION_VARNAME(id)); \
} \
} while (0)
#define CV__CHECK_CUSTOM_TEST(id, type, v, test_expr, v_str, test_expr_str, msg_str) do { \
if(!!(test_expr)) ; else { \
CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_CUSTOM, v_str, test_expr_str); \
cv::detail::check_failed_ ## type((v), CV__CHECK_LOCATION_VARNAME(id)); \
} \
} while (0)
} // namespace
//! @endcond
/// Supported values of these types: int, float, double
#define CV_CheckEQ(v1, v2, msg) CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckNE(v1, v2, msg) CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckLE(v1, v2, msg) CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckLT(v1, v2, msg) CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckGE(v1, v2, msg) CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
#define CV_CheckGT(v1, v2, msg) CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
/// Check with additional "decoding" of type values in error message
#define CV_CheckTypeEQ(t1, t2, msg) CV__CHECK(_, EQ, MatType, t1, t2, #t1, #t2, msg)
/// Check with additional "decoding" of depth values in error message
#define CV_CheckDepthEQ(d1, d2, msg) CV__CHECK(_, EQ, MatDepth, d1, d2, #d1, #d2, msg)
#define CV_CheckChannelsEQ(c1, c2, msg) CV__CHECK(_, EQ, MatChannels, c1, c2, #c1, #c2, msg)
/// Example: type == CV_8UC1 || type == CV_8UC3
#define CV_CheckType(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatType, t, (test_expr), #t, #test_expr, msg)
/// Example: depth == CV_32F || depth == CV_64F
#define CV_CheckDepth(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatDepth, t, (test_expr), #t, #test_expr, msg)
/// Some complex conditions: CV_Check(src2, src2.empty() || (src2.type() == src1.type() && src2.size() == src1.size()), "src2 should have same size/type as src1")
// TODO define pretty-printers: #define CV_Check(v, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
} // namespace
#endif // OPENCV_CORE_CHECK_HPP
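
A minimal usage sketch of these macros (the helper function and messages below are hypothetical; the decoded-value behaviour matches the tests added to test_misc.cpp further down):
@code
#include <opencv2/core/check.hpp>
// Hypothetical validation helper: rejects anything but single-channel float.
static void validateInput(const cv::Mat& src)
{
    CV_CheckTypeEQ(src.type(), CV_32FC1, "Only CV_32FC1 input is supported");
    CV_CheckGT(src.rows, 0, "Input must not be empty");
    // On failure a cv::Exception is thrown whose message names both
    // expressions and decodes type values, e.g. "'src.type()' is 0 (CV_8UC1)".
}
@endcode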

@@ -795,7 +795,7 @@ inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>
/** @brief Multiply and add
Returns \f$ a*b + c \f$
-For floating point types only. */
+For floating point types and signed 32-bit int only. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
@@ -828,6 +828,29 @@ template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n
return c;
}
/** @brief Dot product of elements
Same as cv::v_dotprod, but adds the elements of a third vector to the sums of adjacent pairs.
Scheme:
@code
{A1 A2 ...} // 16-bit
x {B1 B2 ...} // 16-bit
-------------
{A1B1+A2B2+C1 ...} // 32-bit
@endcode
Implemented only for 16-bit signed source type (v_int16x8).
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, n/2> s;
for( int i = 0; i < (n/2); i++ )
s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
return s;
}
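A scalar sketch of the accumulating overload defined above (lane values illustrative):
@code
// Each 32-bit output lane i combines one adjacent 16-bit pair plus c:
//   r[i] = (int32)a[2*i]*b[2*i] + (int32)a[2*i+1]*b[2*i+1] + c[i]
v_int16x8 a = v_setall_s16(2), b = v_setall_s16(3);
v_int32x4 c = v_setall_s32(100);
v_int32x4 r = v_dotprod(a, b, c);  // every lane: 2*3 + 2*3 + 100 = 112
@endcode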
/** @brief Multiply and expand
Multiply values two registers and store results in two registers with wider pack type.

@@ -506,6 +506,12 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+v_int32x4 s = v_dotprod(a, b);
+return v_int32x4(vaddq_s32(s.val, c.val));
+}
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
@@ -730,6 +736,11 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+return v_int32x4(vmlaq_s32(c.val, a.val, b.val));
+}
#if CV_SIMD128_64F
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
@@ -1095,6 +1106,18 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
+#if CV_SIMD128_64F
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+float32x4_t a_ = a.val;
+int32x4_t result;
+__asm__ ("fcvtns %0.4s, %1.4s"
+         : "=w"(result)
+         : "w"(a_)
+         : /* No clobbers */);
+return v_int32x4(result);
+}
+#else
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
@@ -1103,7 +1126,7 @@ inline v_int32x4 v_round(const v_float32x4& a)
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
+#endif
inline v_int32x4 v_floor(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);

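The two v_round paths above round exact halves differently: AArch64 fcvtns rounds to nearest with ties-to-even, while the fallback adds a sign-copied 0.5 and truncates, i.e. rounds halves away from zero. A scalar model of the difference (illustrative only, assuming the default FE_TONEAREST environment):
@code
#include <cmath>
int round_ties_to_even(float x) { return (int)std::nearbyint(x); }              // fcvtns path
int round_half_away(float x)    { return (int)(x + (x >= 0 ? 0.5f : -0.5f)); }  // fallback path
// round_ties_to_even(2.5f) == 2, while round_half_away(2.5f) == 3
@endcode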
@@ -710,6 +710,11 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
+}
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
@@ -954,6 +959,10 @@ inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
__m128i m = _mm_cmpgt_epi32(b.val, a.val);
return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+return a * b + c;
+}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
@@ -1632,7 +1641,7 @@ inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2&
c = v_reinterpret_as_f64(t2);
}
-// 2-channel, float only
+// 2-channel
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
@@ -1644,7 +1653,29 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}
-inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
+inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
+{
+__m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
+__m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
+__m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
+__m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
+__m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
+__m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
+b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+v_int16x8 sa, sb;
+v_load_deinterleave((const short*)ptr, sa, sb);
+a = v_reinterpret_as_u16(sa);
+b = v_reinterpret_as_u16(sb);
+}
+inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{
__m128i t0, t1;
t0 = _mm_unpacklo_epi16(a.val, b.val);

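A round-trip sketch of the new 16-bit deinterleave paired with the interleaved store (buffer contents illustrative):
@code
short buf[16] = { 0,100, 1,101, 2,102, 3,103, 4,104, 5,105, 6,106, 7,107 };
v_int16x8 a, b;
v_load_deinterleave(buf, a, b);  // a = {0..7}, b = {100..107}
v_store_interleave(buf, a, b);   // restores the original a0 b0 a1 b1 ... layout
@endcode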
@@ -1,46 +1,6 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2015, Itseez Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_VSX_HPP
#define OPENCV_HAL_VSX_HPP
@@ -276,34 +236,38 @@ OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
-#define OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(_Tpvec, _Tp, ld_func, st_func) \
+#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a) \
inline _Tpvec v_load(const _Tp* ptr) \
-{ return _Tpvec(ld_func(0, ptr)); } \
-inline _Tpvec v_load_aligned(const _Tp* ptr) \
-{ return _Tpvec(ld_func(0, ptr)); } \
+{ return _Tpvec(ld(0, ptr)); } \
+inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr)) \
+{ return _Tpvec(ld_a(0, ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vec_ld_l8(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
-{ st_func(a.val, 0, ptr); } \
-inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
-{ st_func(a.val, 0, ptr); } \
+{ st(a.val, 0, ptr); } \
+inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a) \
+{ st_a(a.val, 0, ptr); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vec_st_l8(a.val, ptr); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vec_st_h8(a.val, ptr); }
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint8x16, uchar, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int8x16, schar, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint16x8, ushort, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int16x8, short, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint32x4, uint, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int32x4, int, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float32x4, float, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float64x2, double, vsx_ld, vsx_st)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint64x2, uint64, vsx_ld2, vsx_st2)
-OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int64x2, int64, vsx_ld2, vsx_st2)
+#define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16, schar)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8, short)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4, uint)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4, int)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld, vsx_ld, vsx_st, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2, uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2, int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
//////////////// Value reordering ///////////////
@@ -343,7 +307,7 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b1.val = fl(a.val); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
-{ return _Tpwvec(fh(vsx_ld(0, ptr))); }
+{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
@@ -353,10 +317,10 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpac
OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
-{ return v_uint32x4(vec_ld_buw(ptr)); }
+{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
inline v_int32x4 v_load_expand_q(const schar* ptr)
-{ return v_int32x4(vec_ld_bsw(ptr)); }
+{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
/* pack */
#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
@@ -429,36 +393,6 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
d.val = vec_mergesql(a.val, b.val);
}
-/* Extract */
-template<int s, typename _Tpvec>
-inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
-{
-const int w = sizeof(typename _Tpvec::lane_type);
-const int n = _Tpvec::nlanes;
-const unsigned int sf = ((w * n) - (s * w));
-if (s == 0)
-return _Tpvec(a.val);
-else if (sf > 15)
-return _Tpvec();
-// bitwise it just to make xlc happy
-return _Tpvec(vec_sld(b.val, a.val, sf & 15));
-}
-#define OPENCV_HAL_IMPL_VSX_EXTRACT_2(_Tpvec) \
-template<int s> \
-inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
-{ \
-switch(s) { \
-case 0: return _Tpvec(a.val); \
-case 2: return _Tpvec(b.val); \
-case 1: return _Tpvec(vec_sldw(b.val, a.val, 2)); \
-default: return _Tpvec(); \
-} \
-}
-OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_uint64x2)
-OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_int64x2)
////////// Arithmetic, bitwise and comparison operations /////////
/* Element-wise binary and unary operations */
@@ -669,6 +603,11 @@ OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
+/* Extract */
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{ return v_rotate_right<s>(a, b); }
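Lane-level view of the rotate-based v_extract above (a worked example, not part of the diff):
@code
// v_extract<s>(a, b) yields a[s..n-1] followed by b[0..s-1], which is exactly
// what v_rotate_right<s> computes over the (a, b) pair. For v_int32x4:
//   a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}
//   v_extract<1>(a, b) == {a1, a2, a3, b0}
@endcode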
////////// Reduce and mask /////////
/** Reduce **/
@@ -821,6 +760,9 @@ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4)
OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2)
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return a * b + c; }
// TODO: exp, log, sin, cos
/** Absolute values **/
@@ -904,6 +846,9 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)

@@ -1,46 +1,6 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2015, Itseez Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_VSX_UTILS_HPP
#define OPENCV_HAL_VSX_UTILS_HPP
@@ -64,106 +24,77 @@ typedef __vector unsigned char vec_uchar16;
#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
#define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, c))
#define vec_uchar16_c(v) ((vec_uchar16)(v))
-#define vec_uchar16_mx vec_uchar16_sp(0xFF)
-#define vec_uchar16_mn vec_uchar16_sp(0)
-#define vec_uchar16_z vec_uchar16_mn
+#define vec_uchar16_z vec_uchar16_sp(0)
typedef __vector signed char vec_char16;
#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
#define vec_char16_sp(c) (__VSX_S16__(vec_char16, c))
#define vec_char16_c(v) ((vec_char16)(v))
-#define vec_char16_mx vec_char16_sp(0x7F)
-#define vec_char16_mn vec_char16_sp(-0x7F-1)
#define vec_char16_z vec_char16_sp(0)
typedef __vector unsigned short vec_ushort8;
#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
#define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, c))
#define vec_ushort8_c(v) ((vec_ushort8)(v))
-#define vec_ushort8_mx vec_ushort8_sp(0xFFFF)
-#define vec_ushort8_mn vec_ushort8_sp(0)
-#define vec_ushort8_z vec_ushort8_mn
+#define vec_ushort8_z vec_ushort8_sp(0)
typedef __vector signed short vec_short8;
#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
#define vec_short8_sp(c) (__VSX_S8__(vec_short8, c))
#define vec_short8_c(v) ((vec_short8)(v))
-#define vec_short8_mx vec_short8_sp(0x7FFF)
-#define vec_short8_mn vec_short8_sp(-0x7FFF-1)
#define vec_short8_z vec_short8_sp(0)
typedef __vector unsigned int vec_uint4;
#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
#define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, c))
#define vec_uint4_c(v) ((vec_uint4)(v))
-#define vec_uint4_mx vec_uint4_sp(0xFFFFFFFFU)
-#define vec_uint4_mn vec_uint4_sp(0)
-#define vec_uint4_z vec_uint4_mn
+#define vec_uint4_z vec_uint4_sp(0)
typedef __vector signed int vec_int4;
#define vec_int4_set(...) (vec_int4){__VA_ARGS__}
#define vec_int4_sp(c) (__VSX_S4__(vec_int4, c))
#define vec_int4_c(v) ((vec_int4)(v))
-#define vec_int4_mx vec_int4_sp(0x7FFFFFFF)
-#define vec_int4_mn vec_int4_sp(-0x7FFFFFFF-1)
#define vec_int4_z vec_int4_sp(0)
typedef __vector float vec_float4;
#define vec_float4_set(...) (vec_float4){__VA_ARGS__}
#define vec_float4_sp(c) (__VSX_S4__(vec_float4, c))
#define vec_float4_c(v) ((vec_float4)(v))
-#define vec_float4_mx vec_float4_sp(3.40282347E+38F)
-#define vec_float4_mn vec_float4_sp(1.17549435E-38F)
#define vec_float4_z vec_float4_sp(0)
typedef __vector unsigned long long vec_udword2;
#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
#define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, c))
#define vec_udword2_c(v) ((vec_udword2)(v))
-#define vec_udword2_mx vec_udword2_sp(18446744073709551615ULL)
-#define vec_udword2_mn vec_udword2_sp(0)
-#define vec_udword2_z vec_udword2_mn
+#define vec_udword2_z vec_udword2_sp(0)
typedef __vector signed long long vec_dword2;
#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
#define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, c))
#define vec_dword2_c(v) ((vec_dword2)(v))
-#define vec_dword2_mx vec_dword2_sp(9223372036854775807LL)
-#define vec_dword2_mn vec_dword2_sp(-9223372036854775807LL-1)
#define vec_dword2_z vec_dword2_sp(0)
typedef __vector double vec_double2;
#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
#define vec_double2_c(v) ((vec_double2)(v))
#define vec_double2_sp(c) (__VSX_S2__(vec_double2, c))
-#define vec_double2_mx vec_double2_sp(1.7976931348623157E+308)
-#define vec_double2_mn vec_double2_sp(2.2250738585072014E-308)
#define vec_double2_z vec_double2_sp(0)
#define vec_bchar16 __vector __bool char
#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
#define vec_bchar16_c(v) ((vec_bchar16)(v))
-#define vec_bchar16_f (__VSX_S16__(vec_bchar16, 0))
-#define vec_bchar16_t (__VSX_S16__(vec_bchar16, 1))
#define vec_bshort8 __vector __bool short
#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
#define vec_bshort8_c(v) ((vec_bshort8)(v))
-#define vec_bshort8_f (__VSX_S8__(vec_bshort8, 0))
-#define vec_bshort8_t (__VSX_S8__(vec_bshort8, 1))
#define vec_bint4 __vector __bool int
#define vec_bint4_set(...) (vec_bint4){__VA_ARGS__}
#define vec_bint4_c(v) ((vec_bint4)(v))
-#define vec_bint4_f (__VSX_S4__(vec_bint4, 0))
-#define vec_bint4_t (__VSX_S4__(vec_bint4, 1))
#define vec_bdword2 __vector __bool long long
#define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__}
#define vec_bdword2_c(v) ((vec_bdword2)(v))
-#define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
-#define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))
#define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))
@@ -688,34 +619,17 @@ VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu)
{ vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
#endif
-// load 4 unsigned bytes into uint4 vector
-#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
-// load 4 signed bytes into int4 vector
-#define vec_ld_bsw(p) vec_int4_set((p)[0], (p)[1], (p)[2], (p)[3])
-// load 4 unsigned bytes into float vector
-#define vec_ld_bps(p) vec_ctf(vec_ld_buw(p), 0)
// Store lower 8 byte
#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
// Store higher 8 byte
#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
-/*
- * vec_ld_l8(ptr) -> Load 64-bits of integer data to lower part
- * vec_ldz_l8(ptr) -> Load 64-bits of integer data to lower part and zero upper part
-**/
-#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
-VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \
-{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } \
-VSX_FINLINE(Tvec) vec_ldz_l8(const Tp *p) \
-{ \
-/* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */ \
-static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \
-return vec_and(vec_ld_l8(p), (Tvec)mask); \
-}
+// Load 64-bits of integer data to lower part
+#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
+VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \
+{ return ((Tvec)vec_promote(*((uint64*)p), 0)); }
VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16, schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
@@ -745,11 +659,11 @@ VSX_IMPL_LOAD_L8(vec_double2, double)
* Implement vec_unpacklu and vec_unpackhu
* since vec_unpackl, vec_unpackh only support signed integers
**/
-#define VSX_IMPL_UNPACKU(rt, rg, zero) \
-VSX_FINLINE(rt) vec_unpacklu(const rg& a) \
-{ return reinterpret_cast<rt>(vec_mergel(a, zero)); } \
-VSX_FINLINE(rt) vec_unpackhu(const rg& a) \
-{ return reinterpret_cast<rt>(vec_mergeh(a, zero)); }
+#define VSX_IMPL_UNPACKU(rt, rg, zero) \
+VSX_FINLINE(rt) vec_unpacklu(const rg& a) \
+{ return (rt)(vec_mergel(a, zero)); } \
+VSX_FINLINE(rt) vec_unpackhu(const rg& a) \
+{ return (rt)(vec_mergeh(a, zero)); }
VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)

@@ -0,0 +1,160 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
#include "opencv2/core/check.hpp"
namespace cv {
const char* depthToString(int depth)
{
const char* s = detail::depthToString_(depth);
return s ? s : "<invalid depth>";
}
const cv::String typeToString(int type)
{
cv::String s = detail::typeToString_(type);
if (s.empty())
{
static cv::String invalidType("<invalid type>");
return invalidType;
}
return s;
}
namespace detail {
static const char* getTestOpPhraseStr(unsigned testOp)
{
static const char* _names[] = { "{custom check}", "equal to", "not equal to", "less than or equal to", "less than", "greater than or equal to", "greater than" };
CV_DbgAssert(testOp < CV__LAST_TEST_OP);
return testOp < CV__LAST_TEST_OP ? _names[testOp] : "???";
}
static const char* getTestOpMath(unsigned testOp)
{
static const char* _names[] = { "???", "==", "!=", "<=", "<", ">=", ">" };
CV_DbgAssert(testOp < CV__LAST_TEST_OP);
return testOp < CV__LAST_TEST_OP ? _names[testOp] : "???";
}
const char* depthToString_(int depth)
{
static const char* depthNames[] = { "CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F", "CV_USRTYPE1" };
return depth <= CV_USRTYPE1 ? depthNames[depth] : NULL;
}
const cv::String typeToString_(int type)
{
int depth = CV_MAT_DEPTH(type);
int cn = CV_MAT_CN(type);
if (depth >= 0 && depth <= CV_USRTYPE1)
return cv::format("%sC%d", depthToString_(depth), cn);
return cv::String();
}
template<typename T> static CV_NORETURN
void check_failed_auto_(const T& v1, const T& v2, const CheckContext& ctx)
{
std::stringstream ss;
ss << ctx.message << " (expected: '" << ctx.p1_str << " " << getTestOpMath(ctx.testOp) << " " << ctx.p2_str << "'), where" << std::endl
<< " '" << ctx.p1_str << "' is " << v1 << std::endl;
if (ctx.testOp != TEST_CUSTOM && ctx.testOp < CV__LAST_TEST_OP)
{
ss << "must be " << getTestOpPhraseStr(ctx.testOp) << std::endl;
}
ss << " '" << ctx.p2_str << "' is " << v2;
cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line);
}
void check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx)
{
std::stringstream ss;
ss << ctx.message << " (expected: '" << ctx.p1_str << " " << getTestOpMath(ctx.testOp) << " " << ctx.p2_str << "'), where" << std::endl
<< " '" << ctx.p1_str << "' is " << v1 << " (" << depthToString(v1) << ")" << std::endl;
if (ctx.testOp != TEST_CUSTOM && ctx.testOp < CV__LAST_TEST_OP)
{
ss << "must be " << getTestOpPhraseStr(ctx.testOp) << std::endl;
}
ss << " '" << ctx.p2_str << "' is " << v2 << " (" << depthToString(v2) << ")";
cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line);
}
void check_failed_MatType(const int v1, const int v2, const CheckContext& ctx)
{
std::stringstream ss;
ss << ctx.message << " (expected: '" << ctx.p1_str << " " << getTestOpMath(ctx.testOp) << " " << ctx.p2_str << "'), where" << std::endl
<< " '" << ctx.p1_str << "' is " << v1 << " (" << typeToString(v1) << ")" << std::endl;
if (ctx.testOp != TEST_CUSTOM && ctx.testOp < CV__LAST_TEST_OP)
{
ss << "must be " << getTestOpPhraseStr(ctx.testOp) << std::endl;
}
ss << " '" << ctx.p2_str << "' is " << v2 << " (" << typeToString(v2) << ")";
cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line);
}
void check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx)
{
check_failed_auto_<int>(v1, v2, ctx);
}
void check_failed_auto(const int v1, const int v2, const CheckContext& ctx)
{
check_failed_auto_<int>(v1, v2, ctx);
}
void check_failed_auto(const float v1, const float v2, const CheckContext& ctx)
{
check_failed_auto_<float>(v1, v2, ctx);
}
void check_failed_auto(const double v1, const double v2, const CheckContext& ctx)
{
check_failed_auto_<double>(v1, v2, ctx);
}
template<typename T> static CV_NORETURN
void check_failed_auto_(const T& v, const CheckContext& ctx)
{
std::stringstream ss;
ss << ctx.message << ":" << std::endl
<< " '" << ctx.p2_str << "'" << std::endl
<< "where" << std::endl
<< " '" << ctx.p1_str << "' is " << v;
cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line);
}
void check_failed_MatDepth(const int v, const CheckContext& ctx)
{
std::stringstream ss;
ss << ctx.message << ":" << std::endl
<< " '" << ctx.p2_str << "'" << std::endl
<< "where" << std::endl
<< " '" << ctx.p1_str << "' is " << v << " (" << depthToString(v) << ")";
cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line);
}
void check_failed_MatType(const int v, const CheckContext& ctx)
{
std::stringstream ss;
ss << ctx.message << ":" << std::endl
<< " '" << ctx.p2_str << "'" << std::endl
<< "where" << std::endl
<< " '" << ctx.p1_str << "' is " << v << " (" << typeToString(v) << ")";
cv::errorNoReturn(cv::Error::StsError, ss.str(), ctx.func, ctx.file, ctx.line);
}
void check_failed_MatChannels(const int v, const CheckContext& ctx)
{
check_failed_auto_<int>(v, ctx);
}
void check_failed_auto(const int v, const CheckContext& ctx)
{
check_failed_auto_<int>(v, ctx);
}
void check_failed_auto(const float v, const CheckContext& ctx)
{
check_failed_auto_<float>(v, ctx);
}
void check_failed_auto(const double v, const CheckContext& ctx)
{
check_failed_auto_<double>(v, ctx);
}
}} // namespace

@@ -4275,7 +4275,8 @@ public:
entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
Context& ctx = Context::getDefault();
cl_int retval = CL_SUCCESS;
-CV_OCL_CHECK_(entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval), retval);
+entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
+CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
CV_Assert(entry.clBuffer_ != NULL);
if(retval == CL_SUCCESS)
{
@@ -4669,15 +4670,19 @@ public:
{
handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
u->size, u->origdata, &retval);
+CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
+(long long int)u->size, u->origdata, (void*)handle).c_str());
}
if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
{
handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
u->size, u->origdata, &retval);
+CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
+(long long int)u->size, u->origdata, (void*)handle).c_str());
tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
}
}
-CV_OCL_DBG_CHECK_RESULT(retval, "clCreateBuffer()");
+CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
if(!handle || retval != CL_SUCCESS)
return false;
u->handle = handle;
@@ -4805,13 +4810,14 @@ public:
void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
(CL_MAP_READ | CL_MAP_WRITE),
0, u->size, 0, 0, 0, &retval);
-CV_OCL_CHECK_RESULT(retval, "clEnqueueMapBuffer()");
+CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
CV_Assert(u->origdata == data);
if (u->originalUMatData)
{
CV_Assert(u->originalUMatData->data == data);
}
-CV_OCL_CHECK(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0));
+retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
+CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
CV_OCL_DBG_CHECK(clFinish(q));
}
}
@@ -4838,7 +4844,8 @@ public:
else
#endif
{
-CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
+cl_int retval = clReleaseMemObject((cl_mem)u->handle);
+CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
}
u->handle = 0;
u->markDeviceCopyObsolete(true);
@@ -4955,7 +4962,7 @@ public:
u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
(CL_MAP_READ | CL_MAP_WRITE),
0, u->size, 0, 0, 0, &retval);
-CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(sz=%lld)", (int64)u->size).c_str());
+CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
}
if (u->data && retval == CL_SUCCESS)
{
@@ -4982,8 +4989,10 @@ public:
#ifdef HAVE_OPENCL_SVM
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
#endif
-CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
-0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
+cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
+0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
+CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
+(void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
u->markHostCopyObsolete(false);
}
}
@@ -5032,7 +5041,8 @@ public:
if (u->refcount == 0)
{
CV_Assert(u->mapcount-- == 1);
-CV_OCL_CHECK(retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0));
+retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
+CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
if (Device::getDefault().isAMD())
{
// required for multithreaded applications (see stitching test)
@@ -5050,8 +5060,10 @@ public:
#ifdef HAVE_OPENCL_SVM
CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
#endif
-CV_OCL_CHECK(retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
-0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
+retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
+0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
+CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
+(void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
u->markDeviceCopyObsolete(false);
u->markHostCopyObsolete(true);
}
@@ -5354,8 +5366,10 @@ public:
if( iscontinuous )
{
AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
-CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
-dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
+cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
+dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
+CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
+(void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
}
else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
{
@@ -5527,8 +5541,10 @@ public:
{
if( iscontinuous )
{
-CV_OCL_CHECK(retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
-srcrawofs, dstrawofs, total, 0, 0, 0));
+retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
+srcrawofs, dstrawofs, total, 0, 0, 0);
+CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
+(void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
}
else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
{
@@ -6373,7 +6389,9 @@ struct Image2D::Impl
if (!alias && !src.isContinuous())
{
devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
-CV_OCL_CHECK_RESULT(err, "clCreateBuffer()");
+CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
+(long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
+).c_str());
const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,

@@ -249,10 +249,34 @@ const char* Exception::what() const throw() { return msg.c_str(); }
void Exception::formatMessage()
{
-if( func.size() > 0 )
-msg = format("OpenCV(%s) %s:%d: error: (%d) %s: %s in function %s\n", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str(), func.c_str());
+size_t pos = err.find('\n');
+bool multiline = pos != cv::String::npos;
+if (multiline)
+{
+std::stringstream ss;
+size_t prev_pos = 0;
+while (pos != cv::String::npos)
+{
+ss << "> " << err.substr(prev_pos, pos - prev_pos) << std::endl;
+prev_pos = pos + 1;
+pos = err.find('\n', prev_pos);
+}
+ss << "> " << err.substr(prev_pos);
+if (err[err.size() - 1] != '\n')
+ss << std::endl;
+err = ss.str();
+}
+if (func.size() > 0)
+{
+if (multiline)
+msg = format("OpenCV(%s) %s:%d: error: (%d:%s) in function '%s'\n%s", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), func.c_str(), err.c_str());
+else
+msg = format("OpenCV(%s) %s:%d: error: (%d:%s) %s in function '%s'\n", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str(), func.c_str());
+}
else
-msg = format("OpenCV(%s) %s:%d: error: (%d) %s: %s\n", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str());
+{
+msg = format("OpenCV(%s) %s:%d: error: (%d:%s) %s%s", CV_VERSION, file.c_str(), line, code, cvErrorStr(code), err.c_str(), multiline ? "" : "\n");
+}
}
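With this change a multiline err string is rendered as a quoted block; an illustrative failure message (version, path, line, and values are examples) looks like:
@code
OpenCV(3.4.1) /build/opencv/modules/core/src/example.cpp:123: error: (-2:Unspecified error) in function 'void cv::someFunc()'
> Validation check failed (expected: 'value_1 == value_2'), where
>     'value_1' is 123
> must be equal to
>     'value_2' is 5678
@endcode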
static const char* g_hwFeatureNames[CV_HARDWARE_MAX_FEATURE] = { NULL };

@@ -521,15 +521,25 @@ template<typename R> struct TheTest
TheTest & test_dot_prod()
{
typedef typename V_RegTrait128<LaneType>::w_reg Rx2;
+typedef typename Rx2::lane_type w_type;
Data<R> dataA, dataB(2);
R a = dataA, b = dataB;
-Data<Rx2> res = v_dotprod(a, b);
+Data<Rx2> dataC;
+dataC += std::numeric_limits<w_type>::is_signed ?
+std::numeric_limits<w_type>::min() :
+std::numeric_limits<w_type>::max() - R::nlanes * (dataB[0] + 1);
+Rx2 c = dataC;
+Data<Rx2> resD = v_dotprod(a, b),
+resE = v_dotprod(a, b, c);
const int n = R::nlanes / 2;
for (int i = 0; i < n; ++i)
{
-EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], res[i]);
+EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1], resD[i]);
+EXPECT_EQ(dataA[i*2] * dataB[i*2] + dataA[i*2 + 1] * dataB[i*2 + 1] + dataC[i], resE[i]);
}
return *this;
}

@@ -245,4 +245,503 @@ TEST(Core_Version, consistency)
EXPECT_EQ(String(CV_VERSION), cv::getVersionString());
}
//
// Test core/check.hpp macros
//
void test_check_eq_1(int value_1, int value_2)
{
CV_CheckEQ(value_1, value_2, "Validation check failed");
}
TEST(Core_Check, testEQ_int_fail)
{
try
{
test_check_eq_1(123, 5678);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation check failed (expected: 'value_1 == value_2'), where\n"
"> 'value_1' is 123\n"
"> must be equal to\n"
"> 'value_2' is 5678\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testEQ_int_pass)
{
EXPECT_NO_THROW(
{
test_check_eq_1(1234, 1234);
});
}
void test_check_eq_2(float value_1, float value_2)
{
CV_CheckEQ(value_1, value_2, "Validation check failed (float)");
}
TEST(Core_Check, testEQ_float_fail)
{
try
{
test_check_eq_2(1234.5f, 1234.55f);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation check failed (float) (expected: 'value_1 == value_2'), where\n"
"> 'value_1' is 1234.5\n" // TODO Locale handling (use LC_ALL=C on Linux)
"> must be equal to\n"
"> 'value_2' is 1234.55\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testEQ_float_pass)
{
EXPECT_NO_THROW(
{
test_check_eq_2(1234.6f, 1234.6f);
});
}
void test_check_eq_3(double value_1, double value_2)
{
CV_CheckEQ(value_1, value_2, "Validation check failed (double)");
}
TEST(Core_Check, testEQ_double_fail)
{
try
{
test_check_eq_3(1234.5, 1234.56);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation check failed (double) (expected: 'value_1 == value_2'), where\n"
"> 'value_1' is 1234.5\n" // TODO Locale handling (use LC_ALL=C on Linux)
"> must be equal to\n"
"> 'value_2' is 1234.56\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testEQ_double_pass)
{
EXPECT_NO_THROW(
{
test_check_eq_3(1234.0f, 1234.0f);
});
}
void test_check_ne_1(int value_1, int value_2)
{
CV_CheckNE(value_1, value_2, "Validation NE check failed");
}
TEST(Core_Check, testNE_int_fail)
{
try
{
test_check_ne_1(123, 123);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation NE check failed (expected: 'value_1 != value_2'), where\n"
"> 'value_1' is 123\n"
"> must be not equal to\n"
"> 'value_2' is 123\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testNE_int_pass)
{
EXPECT_NO_THROW(
{
test_check_ne_1(123, 1234);
});
}
void test_check_le_1(int value_1, int value_2)
{
CV_CheckLE(value_1, value_2, "Validation LE check failed");
}
TEST(Core_Check, testLE_int_fail)
{
try
{
test_check_le_1(1234, 123);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation LE check failed (expected: 'value_1 <= value_2'), where\n"
"> 'value_1' is 1234\n"
"> must be less than or equal to\n"
"> 'value_2' is 123\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testLE_int_pass)
{
EXPECT_NO_THROW(
{
test_check_le_1(1234, 1234);
});
EXPECT_NO_THROW(
{
test_check_le_1(123, 1234);
});
}
void test_check_lt_1(int value_1, int value_2)
{
CV_CheckLT(value_1, value_2, "Validation LT check failed");
}
TEST(Core_Check, testLT_int_fail)
{
try
{
test_check_lt_1(1234, 123);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation LT check failed (expected: 'value_1 < value_2'), where\n"
"> 'value_1' is 1234\n"
"> must be less than\n"
"> 'value_2' is 123\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testLT_int_fail_eq)
{
try
{
test_check_lt_1(123, 123);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation LT check failed (expected: 'value_1 < value_2'), where\n"
"> 'value_1' is 123\n"
"> must be less than\n"
"> 'value_2' is 123\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testLT_int_pass)
{
EXPECT_NO_THROW(
{
test_check_lt_1(123, 1234);
});
}
void test_check_ge_1(int value_1, int value_2)
{
CV_CheckGE(value_1, value_2, "Validation GE check failed");
}
TEST(Core_Check, testGE_int_fail)
{
try
{
test_check_ge_1(123, 1234);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation GE check failed (expected: 'value_1 >= value_2'), where\n"
"> 'value_1' is 123\n"
"> must be greater than or equal to\n"
"> 'value_2' is 1234\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testGE_int_pass)
{
EXPECT_NO_THROW(
{
test_check_ge_1(1234, 1234);
});
EXPECT_NO_THROW(
{
test_check_ge_1(1234, 123);
});
}
void test_check_gt_1(int value_1, int value_2)
{
CV_CheckGT(value_1, value_2, "Validation GT check failed");
}
TEST(Core_Check, testGT_int_fail)
{
try
{
test_check_gt_1(123, 1234);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation GT check failed (expected: 'value_1 > value_2'), where\n"
"> 'value_1' is 123\n"
"> must be greater than\n"
"> 'value_2' is 1234\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testGT_int_fail_eq)
{
try
{
test_check_gt_1(123, 123);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Validation GT check failed (expected: 'value_1 > value_2'), where\n"
"> 'value_1' is 123\n"
"> must be greater than\n"
"> 'value_2' is 123\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
TEST(Core_Check, testGT_int_pass)
{
EXPECT_NO_THROW(
{
test_check_gt_1(1234, 123);
});
}
void test_check_MatType_1(int src_type)
{
CV_CheckTypeEQ(src_type, CV_32FC1, "Unsupported source type");
}
TEST(Core_Check, testMatType_pass)
{
EXPECT_NO_THROW(
{
test_check_MatType_1(CV_MAKE_TYPE(CV_32F, 1));
});
}
TEST(Core_Check, testMatType_fail_1)
{
try
{
test_check_MatType_1(CV_8UC1);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Unsupported source type (expected: 'src_type == CV_32FC1'), where\n"
"> 'src_type' is 0 (CV_8UC1)\n"
"> must be equal to\n"
"> 'CV_32FC1' is 5 (CV_32FC1)\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
void test_check_MatType_2(int src_type)
{
CV_CheckType(src_type, src_type == CV_32FC1 || src_type == CV_32FC3, "Unsupported src");
}
TEST(Core_Check, testMatType_fail_2)
{
try
{
test_check_MatType_2(CV_8UC1);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Unsupported src:\n"
"> 'src_type == CV_32FC1 || src_type == CV_32FC3'\n"
"> where\n> 'src_type' is 0 (CV_8UC1)\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
void test_check_MatDepth_1(int src_depth)
{
CV_CheckDepthEQ(src_depth, CV_32F, "Unsupported source depth");
}
TEST(Core_Check, testMatDepth_pass)
{
EXPECT_NO_THROW(
{
test_check_MatDepth_1(CV_MAKE_TYPE(CV_32F, 1));
});
}
TEST(Core_Check, testMatDepth_fail_1)
{
try
{
test_check_MatDepth_1(CV_8U);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Unsupported source depth (expected: 'src_depth == CV_32F'), where\n"
"> 'src_depth' is 0 (CV_8U)\n"
"> must be equal to\n"
"> 'CV_32F' is 5 (CV_32F)\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
void test_check_MatDepth_2(int src_depth)
{
CV_CheckDepth(src_depth, src_depth == CV_32F || src_depth == CV_64F, "Unsupported src");
}
TEST(Core_Check, testMatDepth_fail_2)
{
try
{
test_check_MatDepth_2(CV_8U);
FAIL() << "Unreachable code called";
}
catch (const cv::Exception& e)
{
EXPECT_STREQ(e.err.c_str(),
"> Unsupported src:\n"
"> 'src_depth == CV_32F || src_depth == CV_64F'\n"
"> where\n> 'src_depth' is 0 (CV_8U)\n"
);
}
catch (const std::exception& e)
{
FAIL() << "Unexpected C++ exception: " << e.what();
}
catch (...)
{
FAIL() << "Unexpected unknown exception";
}
}
}} // namespace
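Taken together, these tests pin down the exact report format produced by the CV_Check* macros. A minimal usage sketch in application code (the function and messages are illustrative, not part of this patch):

#include <opencv2/core/check.hpp>

void scaleRows(const cv::Mat& src, int factor)
{
    // Each failed check throws cv::Exception carrying the formatted report
    // exercised by the tests above.
    CV_CheckTypeEQ(src.type(), CV_32FC1, "scaleRows supports CV_32FC1 input only");
    CV_CheckGT(factor, 0, "Scale factor must be positive");
    CV_CheckGE(src.rows, factor, "Input must have at least 'factor' rows");
}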

@ -559,7 +559,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
{
public:
float pnorm, epsilon;
bool acrossSpatial;
CV_DEPRECATED bool acrossSpatial;
static Ptr<NormalizeBBoxLayer> create(const LayerParams& params);
};

@ -80,7 +80,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
enum Target
{
DNN_TARGET_CPU,
DNN_TARGET_OPENCL
DNN_TARGET_OPENCL,
DNN_TARGET_OPENCL_FP16
};
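A minimal sketch of selecting the new target through the public Net API (model file names are placeholders):

cv::dnn::Net net = cv::dnn::readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL_FP16); // weights are converted to FP16 at init time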
/** @brief This class provides all data needed to initialize a layer.

@ -318,6 +318,7 @@ for node in graph_def.node:
node.input.pop()
node.input.pop()
node.input.append(layer_256_1_relu1.name)
node.input.append('conv4_3_norm/l2_normalize/Sum/reduction_indices')
break
softmaxShape = NodeDef()

@ -13,7 +13,7 @@
namespace opencv_test {
CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE)
CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL)
CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)
class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<DNNBackend, DNNTarget> >
{
@ -41,8 +41,6 @@ public:
throw cvtest::SkipTestException("OpenCL is not available/disabled in OpenCV");
}
}
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
throw SkipTestException("Skip OpenCL target of Inference Engine backend");
randu(input, 0.0f, 1.0f);
@ -89,24 +87,32 @@ public:
PERF_TEST_P_(DNNTestNetwork, AlexNet)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
"alexnet.yml", Mat(cv::Size(227, 227), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, GoogLeNet)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
"", Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, ResNet_50)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
"resnet_50.yml", Mat(cv::Size(224, 224), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, SqueezeNet_v1_1)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt",
"squeezenet_v1_1.yml", Mat(cv::Size(227, 227), CV_32FC3));
}
@ -135,14 +141,18 @@ PERF_TEST_P_(DNNTestNetwork, SSD)
PERF_TEST_P_(DNNTestNetwork, OpenFace)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/openface_nn4.small2.v1.t7", "", "",
Mat(cv::Size(96, 96), CV_32FC3));
}
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
@ -150,7 +160,8 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL ||
backend == DNN_BACKEND_HALIDE)
backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/ssd_mobilenet_v1_coco.pb", "ssd_mobilenet_v1_coco.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
@ -158,7 +169,9 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_TensorFlow)
PERF_TEST_P_(DNNTestNetwork, DenseNet_121)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", "",
Mat(cv::Size(224, 224), CV_32FC3));
}
@ -189,7 +202,7 @@ PERF_TEST_P_(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL)
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
@ -197,7 +210,9 @@ PERF_TEST_P_(DNNTestNetwork, opencv_face_detector)
PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}
@ -209,6 +224,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
#endif
#ifdef HAVE_INF_ENGINE
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)

@ -50,7 +50,7 @@ syntax = "proto2";
package opencv_caffe;
// NVidia's Caffe feature is used to store fp16 weights, https://github.com/NVIDIA/caffe:
// NVIDIA's Caffe feature is used to store fp16 weights, https://github.com/NVIDIA/caffe:
// Math and storage types
enum Type {
DOUBLE = 0;
@ -72,10 +72,10 @@ message BlobProto {
repeated double double_data = 8 [packed = true];
repeated double double_diff = 9 [packed = true];
// NVidia's Caffe fields begin.
// NVIDIA's Caffe fields begin.
optional Type raw_data_type = 10;
optional bytes raw_data = 12 [packed = false];
// NVidia's Caffe fields end.
// NVIDIA's Caffe fields end.
// 4D dimensions -- deprecated. Use "shape" instead.
optional int32 num = 1 [default = 0];

@ -1154,7 +1154,7 @@ struct Net::Impl
ld.skip = true;
}
layers[lastLayerId].skip = false;
ieNode->net->init();
ieNode->net->init(preferableTarget);
return;
}
@ -1167,17 +1167,17 @@ struct Net::Impl
for (it = layers.begin(); it != layers.end(); ++it)
{
LayerData &ld = it->second;
ld.skip = true; // Initially skip all Inference Engine supported layers.
Ptr<Layer> layer = ld.layerInstance;
bool fused = ld.skip && ld.id != 0;
Ptr<Layer> layer = ld.layerInstance;
if (!layer->supportBackend(preferableBackend))
{
addInfEngineNetOutputs(ld);
ld.skip = false;
net = Ptr<InfEngineBackendNet>();
netBlobsWrappers.clear();
continue;
}
ld.skip = true; // Initially skip all Inference Engine supported layers.
// Create a new network if one of inputs from different Inference Engine graph.
for (int i = 0; i < ld.inputBlobsId.size(); ++i)
@ -1217,19 +1217,16 @@ struct Net::Impl
}
netBlobsWrappers[ld.id] = ld.outputBlobsWrappers[0];
bool fused = false;
Ptr<BackendNode> node;
if (!net.empty())
{
// Try to fuse.
bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
if (inPlace)
if (fused)
{
node = layer->tryAttach(layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend]);
fused = !node.empty();
if (fused)
ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
CV_Assert(inPlace);
node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
}
}
else
@ -1247,6 +1244,19 @@ struct Net::Impl
CV_Assert(!ieNode.empty());
ieNode->net = net;
if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !fused)
{
ieNode->layer->precision = InferenceEngine::Precision::FP16;
auto weightableLayer = std::dynamic_pointer_cast<InferenceEngine::WeightableLayer>(ieNode->layer);
if (weightableLayer)
{
if (weightableLayer->_weights)
weightableLayer->_weights = convertFp16(weightableLayer->_weights);
if (weightableLayer->_biases)
weightableLayer->_biases = convertFp16(weightableLayer->_biases);
}
}
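// Note: the FP16 precision flag is set on every non-fused layer, but only layers
// deriving from InferenceEngine::WeightableLayer (e.g. convolution, fully connected)
// carry _weights/_biases blobs; the dynamic_pointer_cast yields null for the rest,
// so their data needs no conversion.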
ieNode->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers);
net->addBlobs(ld.inputBlobsWrappers);
net->addBlobs(ld.outputBlobsWrappers);
@ -1276,7 +1286,7 @@ struct Net::Impl
if (!ieNode->net->isInitialized())
{
ieNode->net->init();
ieNode->net->init(preferableTarget);
ld.skip = false;
}
}
@ -1380,7 +1390,8 @@ struct Net::Impl
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT)
if( !fusion || preferableBackend != DNN_BACKEND_DEFAULT &&
preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
return;
CV_TRACE_FUNCTION();
@ -1407,7 +1418,7 @@ struct Net::Impl
// some other layers.
// TODO: make the OpenCL target support more fusion styles.
if ( preferableTarget == DNN_TARGET_OPENCL &&
if ( preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_OPENCL &&
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
ld.layerInstance->type != "MVN")) )
continue;
@ -1442,6 +1453,9 @@ struct Net::Impl
break;
}
if (preferableBackend != DNN_BACKEND_DEFAULT)
continue; // Go to the next layer.
// For now, the OpenCL target supports fusion with the ReLU/ChannelsPReLU/Power/Tanh activations
if ( preferableTarget != DNN_TARGET_OPENCL ||
(preferableTarget == DNN_TARGET_OPENCL &&
@ -1583,6 +1597,9 @@ struct Net::Impl
}
}
if (preferableBackend != DNN_BACKEND_DEFAULT)
continue; // Go to the next layer.
// the optimization #2. if there is no layer that takes max pooling layer's computed
// max indices (and only some semantic segmentation networks might need this;
// many others only take the maximum values), then we switch the max pooling
@ -1944,7 +1961,8 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin)
ld.layerInstance = Ptr<Layer>(new InfEngineBackendLayer(it.second));
ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
cvNet.connect(0, 0, lid, 0);
for (int i = 0; i < inputsNames.size(); ++i)
cvNet.connect(0, i, lid, i);
}
cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);

@ -234,19 +234,6 @@ public:
#endif // HAVE_HALIDE
break;
}
case DNN_BACKEND_INFERENCE_ENGINE:
{
#ifdef HAVE_INF_ENGINE
auto base = node.dynamicCast<InfEngineBackendNode>();
auto conv = std::dynamic_pointer_cast<InferenceEngine::ConvolutionLayer>(base->layer);
if (conv)
{
fuseConvWeights(conv, weights_, bias_);
return base;
}
#endif // HAVE_INF_ENGINE
break;
}
}
return Ptr<BackendNode>();
}
@ -287,8 +274,9 @@ public:
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::ScaleShiftLayer> ieLayer(new InferenceEngine::ScaleShiftLayer(lp));
ieLayer->_weights = wrapToInfEngineBlob(weights_);
ieLayer->_biases = wrapToInfEngineBlob(bias_);
const int numChannels = weights_.total();
ieLayer->_weights = wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C);
ieLayer->_biases = wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE

@ -40,6 +40,7 @@
//
//M*/
#include "../precomp.hpp"
#include "../op_inf_engine.hpp"
namespace cv
{
@ -53,6 +54,12 @@ public:
setParamsFrom(params);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine();
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
@ -104,6 +111,19 @@ public:
if (outputs[i].data != inputs[i]->data)
inputs[i]->copyTo(outputs[i]);
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
InferenceEngine::LayerParams lp;
lp.name = name;
lp.type = "Split";
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::SplitLayer> ieLayer(new InferenceEngine::SplitLayer(lp));
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();
}
};
Ptr<Layer> BlankLayer::create(const LayerParams& params)

@ -173,21 +173,21 @@ public:
std::vector<float> biasvec;
std::vector<float> reluslope;
Ptr<ActivationLayer> activ;
bool newWeightAndBias;
bool fusedBias;
#ifdef HAVE_OPENCL
Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
std::vector<UMat> umat_blobs;
bool fusedBias;
bool newWeightAndBias;
bool newActiv;
ocl4dnnFusedActiv_t activType;
float power;
#endif
ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
{
#ifdef HAVE_OPENCL
fusedBias = false;
newWeightAndBias = false;
fusedBias = false;
#ifdef HAVE_OPENCL
newActiv = false;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
power = 0.f;
@ -350,10 +350,8 @@ public:
biasvec[i] += b.at<float>(i);
}
#ifdef HAVE_OPENCL
newWeightAndBias = !w.empty() || !b.empty();
fusedBias = hasBias() || !b.empty();
#endif
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
}
@ -433,9 +431,31 @@ public:
ieLayer->_dilation_y = dilation.height;
ieLayer->_group = group;
ieLayer->_weights = wrapToInfEngineBlob(blobs[0]);
if (hasBias())
ieLayer->_biases = wrapToInfEngineBlob(blobs[1]);
ieLayer->_weights = wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW);
if (newWeightAndBias)
{
if (weightsMat.isContinuous())
{
Mat fusedWeights = weightsMat.reshape(1, blobs[0].dims, blobs[0].size);
ieLayer->_weights = wrapToInfEngineBlob(fusedWeights, InferenceEngine::Layout::OIHW);
}
else
{
ieLayer->_weights = InferenceEngine::make_shared_blob<float>(
InferenceEngine::Precision::FP32, InferenceEngine::Layout::OIHW,
ieLayer->_weights->dims());
ieLayer->_weights->allocate();
Mat newWeights = infEngineBlobToMat(ieLayer->_weights).reshape(1, outCn);
Mat fusedWeights = weightsMat.colRange(0, newWeights.cols);
fusedWeights.copyTo(newWeights);
}
}
if (hasBias() || fusedBias)
{
Mat biasesMat({outCn}, CV_32F, &biasvec[0]);
ieLayer->_biases = wrapToInfEngineBlob(biasesMat, {outCn}, InferenceEngine::Layout::C);
}
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();

@ -412,9 +412,9 @@ public:
std::shared_ptr<InferenceEngine::FullyConnectedLayer> ieLayer(new InferenceEngine::FullyConnectedLayer(lp));
ieLayer->_out_num = blobs[0].size[0];
ieLayer->_weights = wrapToInfEngineBlob(blobs[0]);
ieLayer->_weights = wrapToInfEngineBlob(blobs[0], {blobs[0].size[0], blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW);
if (blobs.size() > 1)
ieLayer->_biases = wrapToInfEngineBlob(blobs[1]);
ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {ieLayer->_out_num}, InferenceEngine::Layout::C);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();

@ -77,7 +77,20 @@ bool getParameter(const LayerParams &params, const std::string& nameBase, const
{
if (params.has(nameAll_))
{
parameterH = parameterW = params.get<int>(nameAll_);
DictValue param = params.get(nameAll_);
parameterH = param.get<int>(0);
if (param.size() == 1)
{
parameterW = parameterH;
}
else if (param.size() == 2)
{
parameterW = param.get<int>(1);
}
else
{
return false;
}
return true;
}
else

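Worked example: with this change a single-valued parameter such as kernel_size: 3 still yields parameterH = parameterW = 3, a two-element list such as kernel_size: [3, 5] (parameter name for illustration) yields parameterH = 3 and parameterW = 5, and any longer list makes getParameter report failure.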
@ -42,6 +42,7 @@
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_inf_engine.hpp"
namespace cv { namespace dnn {
@ -54,9 +55,19 @@ public:
pnorm = params.get<float>("p", 2);
epsilon = params.get<float>("eps", 1e-10f);
acrossSpatial = params.get<bool>("across_spatial", true);
startAxis = params.get<int>("start_axis", 1);
CV_Assert(!params.has("across_spatial") || !params.has("end_axis"));
endAxis = params.get<int>("end_axis", acrossSpatial ? -1 : startAxis);
CV_Assert(pnorm > 0);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_DEFAULT ||
backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() &&
pnorm == 2 && !blobs.empty();
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
@ -85,20 +96,26 @@ public:
const UMat& inp0 = inputs[0];
UMat& buffer = internals[0];
size_t num = inp0.size[0];
size_t channels = inp0.size[1];
size_t channelSize = inp0.total() / (num * channels);
startAxis = clamp(startAxis, inp0.dims);
endAxis = clamp(endAxis, inp0.dims);
size_t num = total(shape(inp0.size), 0, startAxis);
size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1);
size_t planeSize = inp0.total() / (num * numPlanes);
MatShape s = shape(1, inputs[0].total());
UMat inp = inputs[0].reshape(1, s.size(), &s[0]).reshape(1, num);
UMat out = outputs[0].reshape(1, s.size(), &s[0]).reshape(1, num);
for (size_t i = 0; i < num; ++i)
{
MatShape s = shape(channels, channelSize);
UMat src = inputs[i].reshape(1, s.size(), &s[0]);
UMat dst = outputs[i].reshape(1, s.size(), &s[0]);
s = shape(numPlanes, planeSize);
UMat src = inp.row(i).reshape(1, s.size(), &s[0]);
UMat dst = out.row(i).reshape(1, s.size(), &s[0]);
UMat abs_mat;
absdiff(src, cv::Scalar::all(0), abs_mat);
pow(abs_mat, pnorm, buffer);
if (acrossSpatial)
if (planeSize == 1)
{
// add eps to avoid overflow
float absSum = sum(buffer)[0] + epsilon;
@ -114,7 +131,7 @@ public:
// compute inverted norm to call multiply instead divide
cv::pow(norm, -1.0f / pnorm, norm);
repeat(norm, channels, 1, buffer);
repeat(norm, numPlanes, 1, buffer);
multiply(src, buffer, dst);
}
@ -130,7 +147,7 @@ public:
else
{
// _scale: _channels x 1
CV_Assert(scale.total() == channels);
CV_Assert(scale.total() == numPlanes);
repeat(scale, 1, dst.cols, buffer);
multiply(dst, buffer, dst);
}
@ -162,17 +179,22 @@ public:
const Mat& inp0 = *inputs[0];
Mat& buffer = internals[0];
size_t num = inp0.size[0];
size_t channels = inp0.size[1];
size_t channelSize = inp0.total() / (num * channels);
startAxis = clamp(startAxis, inp0.dims);
endAxis = clamp(endAxis, inp0.dims);
const float* inpData = inp0.ptr<float>();
float* outData = outputs[0].ptr<float>();
size_t num = total(shape(inp0.size), 0, startAxis);
size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1);
size_t planeSize = inp0.total() / (num * numPlanes);
for (size_t n = 0; n < num; ++n)
{
Mat src = Mat(channels, channelSize, CV_32F, (void*)inp0.ptr<float>(n));
Mat dst = Mat(channels, channelSize, CV_32F, (void*)outputs[0].ptr<float>(n));
Mat src = Mat(numPlanes, planeSize, CV_32F, (void*)inpData);
Mat dst = Mat(numPlanes, planeSize, CV_32F, (void*)outData);
cv::pow(abs(src), pnorm, buffer);
if (acrossSpatial)
if (planeSize == 1)
{
// add eps to avoid overflow
float absSum = sum(buffer)[0] + epsilon;
@ -188,7 +210,7 @@ public:
// compute inverted norm to call multiply instead divide
cv::pow(norm, -1.0f / pnorm, norm);
repeat(norm, channels, 1, buffer);
repeat(norm, numPlanes, 1, buffer);
multiply(src, buffer, dst);
}
@ -204,13 +226,40 @@ public:
else
{
// _scale: _channels x 1
CV_Assert(scale.total() == channels);
CV_Assert(scale.total() == numPlanes);
repeat(scale, 1, dst.cols, buffer);
multiply(dst, buffer, dst);
}
}
inpData += numPlanes * planeSize;
outData += numPlanes * planeSize;
}
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE
InferenceEngine::LayerParams lp;
lp.name = name;
lp.type = "Normalize";
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::CNNLayer> ieLayer(new InferenceEngine::CNNLayer(lp));
CV_Assert(!blobs.empty());
ieLayer->params["eps"] = format("%f", epsilon);
ieLayer->params["across_spatial"] = acrossSpatial ? "1" : "0";
ieLayer->params["channel_shared"] = blobs[0].total() == 1 ? "1" : "0";
const int numChannels = blobs[0].total();
ieLayer->blobs["weights"] = wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE
return Ptr<BackendNode>();
}
private:
int startAxis, endAxis;
};

@ -132,20 +132,6 @@ public:
#endif // HAVE_HALIDE
break;
}
case DNN_BACKEND_INFERENCE_ENGINE:
{
#ifdef HAVE_INF_ENGINE
auto base = node.dynamicCast<InfEngineBackendNode>();
auto conv = std::dynamic_pointer_cast<InferenceEngine::ConvolutionLayer>(base->layer);
if (conv)
{
Mat bias = hasBias ? blobs[1] : Mat();
fuseConvWeights(conv, blobs[0], bias);
return base;
}
#endif // HAVE_INF_ENGINE
break;
}
}
return Ptr<BackendNode>();
}
@ -192,9 +178,10 @@ public:
lp.precision = InferenceEngine::Precision::FP32;
std::shared_ptr<InferenceEngine::ScaleShiftLayer> ieLayer(new InferenceEngine::ScaleShiftLayer(lp));
ieLayer->_weights = wrapToInfEngineBlob(blobs[0]);
const int numChannels = blobs[0].total();
ieLayer->_weights = wrapToInfEngineBlob(blobs[0], {numChannels}, InferenceEngine::Layout::C);
if (hasBias)
ieLayer->_biases = wrapToInfEngineBlob(blobs[1]);
ieLayer->_biases = wrapToInfEngineBlob(blobs[1], {numChannels}, InferenceEngine::Layout::C);
return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif // HAVE_INF_ENGINE

@ -90,27 +90,6 @@ public:
}
}
virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
{
switch (node->backendId)
{
case DNN_BACKEND_INFERENCE_ENGINE:
{
#ifdef HAVE_INF_ENGINE
auto base = node.dynamicCast<InfEngineBackendNode>();
auto conv = std::dynamic_pointer_cast<InferenceEngine::ConvolutionLayer>(base->layer);
if (conv)
{
fuseConvWeights(conv, Mat(), blobs[0]);
return base;
}
#endif // HAVE_INF_ENGINE
break;
}
}
return Ptr<BackendNode>();
}
virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
{
#ifdef HAVE_INF_ENGINE

@ -18,6 +18,11 @@ namespace cv { namespace dnn {
#ifdef HAVE_INF_ENGINE
static int infEngineVersion()
{
return std::atoi(InferenceEngine::GetInferenceEngineVersion()->buildNumber);
}
InfEngineBackendNode::InfEngineBackendNode(const InferenceEngine::CNNLayerPtr& _layer)
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE), layer(_layer) {}
@ -58,23 +63,37 @@ static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std:
{
std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
std::reverse(reversedShape.begin(), reversedShape.end());
return InferenceEngine::DataPtr(
new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32,
InferenceEngine::Layout::ANY)
);
if (infEngineVersion() > 5855)
{
InferenceEngine::Layout l = InferenceEngine::Layout::ANY;
if (m.dims == 4)
l = InferenceEngine::Layout::NCHW;
else if (m.dims == 2)
l = InferenceEngine::Layout::NC;
return InferenceEngine::DataPtr(
new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, l)
);
}
else
{
return InferenceEngine::DataPtr(
new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32)
);
}
}
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape)
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape,
InferenceEngine::Layout layout)
{
return InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
shape, (float*)m.data);
layout, shape, (float*)m.data);
}
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m)
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout)
{
std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
std::reverse(reversedShape.begin(), reversedShape.end());
return wrapToInfEngineBlob(m, reversedShape);
return wrapToInfEngineBlob(m, reversedShape, layout);
}
InferenceEngine::DataPtr infEngineDataNode(const Ptr<BackendWrapper>& ptr)
@ -109,10 +128,14 @@ void InfEngineBackendWrapper::setHostDirty()
InfEngineBackendNet::InfEngineBackendNet()
{
targetDevice = InferenceEngine::TargetDevice::eCPU;
precision = InferenceEngine::Precision::FP32;
}
InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net)
{
targetDevice = InferenceEngine::TargetDevice::eCPU;
precision = InferenceEngine::Precision::FP32;
inputs = net.getInputsInfo();
outputs = net.getOutputsInfo();
layers.resize(net.layerCount()); // A hack to execute InfEngineBackendNet::layerCount correctly.
@ -126,9 +149,14 @@ void InfEngineBackendNet::Release() noexcept
outputs.clear();
}
void InfEngineBackendNet::setPrecision(InferenceEngine::Precision p) noexcept
{
precision = p;
}
InferenceEngine::Precision InfEngineBackendNet::getPrecision() noexcept
{
return InferenceEngine::Precision::FP32;
return precision;
}
// Assume that the outputs of the network are unconnected blobs.
@ -161,9 +189,8 @@ InferenceEngine::InputInfo::Ptr InfEngineBackendNet::getInput(const std::string
return it->second;
}
void InfEngineBackendNet::getName(char *pName, size_t len) noexcept
void InfEngineBackendNet::getName(char*, size_t) noexcept
{
CV_Error(Error::StsNotImplemented, "");
}
size_t InfEngineBackendNet::layerCount() noexcept
@ -213,13 +240,15 @@ InfEngineBackendNet::getLayerByName(const char *layerName, InferenceEngine::CNNL
void InfEngineBackendNet::setTargetDevice(InferenceEngine::TargetDevice device) noexcept
{
if (device != InferenceEngine::TargetDevice::eCPU)
if (device != InferenceEngine::TargetDevice::eCPU &&
device != InferenceEngine::TargetDevice::eGPU)
CV_Error(Error::StsNotImplemented, "");
targetDevice = device;
}
InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() noexcept
{
return InferenceEngine::TargetDevice::eCPU;
return targetDevice;
}
InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t size) noexcept
@ -234,7 +263,7 @@ size_t InfEngineBackendNet::getBatchSize() const noexcept
return 0;
}
void InfEngineBackendNet::init()
void InfEngineBackendNet::init(int targetId)
{
if (inputs.empty())
{
@ -307,6 +336,15 @@ void InfEngineBackendNet::init()
outBlobs[it.first] = allBlobs[it.first];
}
switch (targetId)
{
case DNN_TARGET_CPU: setTargetDevice(InferenceEngine::TargetDevice::eCPU); break;
case DNN_TARGET_OPENCL_FP16: setPrecision(InferenceEngine::Precision::FP16); // Fall through to set the GPU device.
case DNN_TARGET_OPENCL: setTargetDevice(InferenceEngine::TargetDevice::eGPU); break;
default:
CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
}
if (!isInitialized())
initPlugin(*this);
}
@ -317,10 +355,9 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
InferenceEngine::StatusCode status;
InferenceEngine::ResponseDesc resp;
const InferenceEngine::Version* v = InferenceEngine::GetInferenceEngineVersion();
plugin = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(InferenceEngine::TargetDevice::eCPU);
if (std::atoi(v->buildNumber) > 5855)
plugin = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(targetDevice);
if (infEngineVersion() > 5855 && targetDevice == InferenceEngine::TargetDevice::eCPU)
{
#ifdef _WIN32
InferenceEngine::IExtensionPtr extension =
@ -360,7 +397,7 @@ void InfEngineBackendNet::forward()
CV_Error(Error::StsAssert, resp.msg);
}
static inline Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
{
// NOTE: Inference Engine sizes are reversed.
std::vector<size_t> dims = blob->dims();
@ -369,56 +406,6 @@ static inline Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
return Mat(size, CV_32F, (void*)blob->buffer());
}
void fuseConvWeights(const std::shared_ptr<InferenceEngine::ConvolutionLayer>& conv,
const Mat& w, const Mat& b)
{
CV_Assert(!w.empty() || !b.empty());
if (!w.empty())
{
// Get convolution's weights. Clone the data because Inference Engine can host it
// and conv->_weights->allocate() below will deallocate it.
Mat originWeights = infEngineBlobToMat(conv->_weights).clone();
// Create new weights blob.
conv->_weights = InferenceEngine::make_shared_blob<float>(
InferenceEngine::Precision::FP32, conv->_weights->dims());
conv->_weights->allocate();
// Convolution weights have OIHW data layout.
// (conv(I) + b1 ) * w + b2
// w*conv(I) + b1 * w + b2
Mat fusedWeights = infEngineBlobToMat(conv->_weights);
const int numChannels = fusedWeights.size[0];
// Mat weights = blobs[0].reshape(1, 1);
// Mat bias = hasBias ? blobs[1].reshape(1, 1) : Mat();
CV_Assert(numChannels == w.total());
CV_Assert(b.empty() || numChannels == b.total());
for (int i = 0; i < numChannels; ++i)
{
cv::multiply(slice(originWeights, i), w.at<float>(i), slice(fusedWeights, i));
}
}
if (conv->_biases)
{
// The same for biases.
Mat originBiases = infEngineBlobToMat(conv->_biases).clone();
conv->_biases = InferenceEngine::make_shared_blob<float>(
InferenceEngine::Precision::FP32, conv->_biases->dims());
conv->_biases->allocate();
Mat fusedBiases = infEngineBlobToMat(conv->_biases);
originBiases.copyTo(fusedBiases);
if (!w.empty())
cv::multiply(w.reshape(1, fusedBiases.dims, &fusedBiases.size[0]), fusedBiases, fusedBiases);
if (!b.empty())
cv::add(fusedBiases, b.reshape(1, fusedBiases.dims, &fusedBiases.size[0]), fusedBiases);
}
else
conv->_biases = wrapToInfEngineBlob(b);
}
InfEngineBackendLayer::InfEngineBackendLayer(const InferenceEngine::DataPtr& output_)
{
output = output_;
@ -454,6 +441,16 @@ void InfEngineBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArra
CV_Error(Error::StsInternal, "Choose Inference Engine as a preferable backend.");
}
InferenceEngine::TBlob<int16_t>::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob)
{
auto halfs = InferenceEngine::make_shared_blob<int16_t>(InferenceEngine::Precision::FP16, blob->layout(), blob->dims());
halfs->allocate();
Mat floatsData(1, blob->size(), CV_32F, blob->buffer());
Mat halfsData(1, blob->size(), CV_16SC1, halfs->buffer());
convertFp16(floatsData, halfsData);
return halfs;
}
#endif // HAVE_INF_ENGINE
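The helper above delegates the element-wise conversion to the core cv::convertFp16 routine; a standalone sketch of the same Mat-level round trip (values are illustrative):

float values[] = {0.5f, -1.25f, 3.0f};
cv::Mat floats(1, 3, CV_32F, values);
cv::Mat halfs;                      // FP16 payload stored in 16-bit elements (CV_16S)
cv::convertFp16(floats, halfs);     // FP32 -> FP16
cv::Mat restored;
cv::convertFp16(halfs, restored);   // FP16 -> FP32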
bool haveInfEngine()

@ -32,6 +32,8 @@ public:
virtual void Release() noexcept CV_OVERRIDE;
void setPrecision(InferenceEngine::Precision p) noexcept;
virtual InferenceEngine::Precision getPrecision() noexcept CV_OVERRIDE;
virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) noexcept /*CV_OVERRIDE*/;
@ -68,7 +70,7 @@ public:
virtual size_t getBatchSize() const noexcept CV_OVERRIDE;
void init();
void init(int targetId);
void addBlobs(const std::vector<Ptr<BackendWrapper> >& wrappers);
@ -83,6 +85,8 @@ private:
InferenceEngine::BlobMap inpBlobs;
InferenceEngine::BlobMap outBlobs;
InferenceEngine::BlobMap allBlobs;
InferenceEngine::TargetDevice targetDevice;
InferenceEngine::Precision precision;
InferenceEngine::InferenceEnginePluginPtr plugin;
void initPlugin(InferenceEngine::ICNNNetwork& net);
@ -116,15 +120,17 @@ public:
InferenceEngine::TBlob<float>::Ptr blob;
};
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m);
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY);
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape);
InferenceEngine::TBlob<float>::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape, InferenceEngine::Layout layout);
InferenceEngine::DataPtr infEngineDataNode(const Ptr<BackendWrapper>& ptr);
// Fuses convolution weights and biases with channel-wise scales and shifts.
void fuseConvWeights(const std::shared_ptr<InferenceEngine::ConvolutionLayer>& conv,
const Mat& w, const Mat& b = Mat());
Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob);
// Convert Inference Engine blob with FP32 precision to FP16 precision.
// Allocates memory for a new blob.
InferenceEngine::TBlob<int16_t>::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob);
// This is a fake class to run networks from Model Optimizer. Objects of that
// class simulate responses of layers that are imported by OpenCV and supported by
@ -151,7 +157,6 @@ private:
InferenceEngine::DataPtr output;
};
#endif // HAVE_INF_ENGINE
bool haveInfEngine();

@ -80,14 +80,16 @@ public:
{
CV_Assert(inpId < node.input_size());
std::string name = node.input(inpId);
// If an operation produces several tensors, they are specified by an index
// after the ':' character, for example "input:0".
name = name.substr(0, name.rfind(':'));
const int numNodes = net.node_size();
for (int i = 0; i < numNodes; ++i)
{
if (net.node(i).name() == name)
return net.node(i);
}
CV_Error(Error::StsParseError, "Input node with name " + name + " not found");
return net.node(0); // just return something
CV_ErrorNoReturn(Error::StsParseError, "Input node with name " + name + " not found");
}
// Match TensorFlow subgraph starting from <nodeId> with a set of nodes to be fused.
@ -400,6 +402,23 @@ private:
int numOutDims;
};
class L2NormalizeSubgraph : public Subgraph
{
public:
L2NormalizeSubgraph()
{
int input = addNodeToMatch("");
int square = addNodeToMatch("Square", input);
int reductionIndices = addNodeToMatch("Const");
int sum = addNodeToMatch("Sum", square, reductionIndices);
int y = addNodeToMatch("Const");
int maximum = addNodeToMatch("Maximum", sum, y);
int rsqrt = addNodeToMatch("Rsqrt", maximum);
addNodeToMatch("Mul", input, rsqrt);
setFusedNode("L2Normalize", input, reductionIndices);
}
};
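For reference, this matcher encodes the graph that tf.nn.l2_normalize expands to, i.e. in one expression (eps being the Maximum constant):

out = input * rsqrt(max(sum(square(input), reduction_indices), eps))

Only the input and the reduction indices survive into the fused L2Normalize node; the importer later converts the indices into start_axis/end_axis.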
void simplifySubgraphs(tensorflow::GraphDef& net)
{
std::vector<Ptr<Subgraph> > subgraphs;
@ -410,6 +429,7 @@ void simplifySubgraphs(tensorflow::GraphDef& net)
subgraphs.push_back(Ptr<Subgraph>(new SoftMaxKerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new ReLU6KerasSubgraph()));
subgraphs.push_back(Ptr<Subgraph>(new ReshapeKerasSubgraph(3)));
subgraphs.push_back(Ptr<Subgraph>(new L2NormalizeSubgraph()));
int numNodes = net.node_size();
std::vector<int> matchedNodesIds;

@ -37,7 +37,13 @@ using ::google::protobuf::Reflection;
namespace
{
static int toNCHW[] = {0, 2, 3, 1};
static int toNCHW(int idx)
{
CV_Assert(-4 <= idx && idx < 4);
if (idx == 0) return 0;
else if (idx > 0) return idx % 3 + 1;
else return (4 + idx) % 3 + 1;
}
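// The remap realigns TensorFlow's NHWC axis indices with OpenCV's NCHW order,
// handling negative (from-the-end) indices as well:
//   0 -> 0 (batch), 1 -> 2 (height), 2 -> 3 (width),  3 -> 1 (channels)
//  -1 -> 1 (channels), -2 -> 3 (width), -3 -> 2 (height)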
// These values are used to indicate a layer output's data layout where possible.
enum DataLayout
@ -556,11 +562,23 @@ static void addConstNodes(tensorflow::GraphDef& net, std::map<String, int>& cons
// this layer's output has this data layout too. Returns DATA_LAYOUT_UNKNOWN otherwise.
static int predictOutputDataLayout(const tensorflow::NodeDef& layer, const std::map<String, int>& data_layouts)
{
if (hasLayerAttr(layer, "data_format"))
{
std::string format = getLayerAttr(layer, "data_format").s();
if (format == "NHWC" || format == "channels_last")
return DATA_LAYOUT_NHWC;
else if (format == "NCHW" || format == "channels_first")
return DATA_LAYOUT_NCHW;
else
CV_Error(Error::StsParseError, "Unknown data_format value: " + format);
}
// Determine layout by layer's inputs
int layout = DATA_LAYOUT_UNKNOWN;
std::map<String, int>::const_iterator it;
for (int i = 0, n = layer.input_size(); i < n; ++i)
{
it = data_layouts.find(layer.input(i));
it = data_layouts.find(layer.input(i).substr(0, layer.input(i).rfind(':')));
if (it != data_layouts.end())
{
if (it->second == DATA_LAYOUT_UNKNOWN)
@ -708,17 +726,7 @@ void TFImporter::populateNet(Net dstNet)
// one input only
connect(layer_id, dstNet, parsePin(input), id, 0);
if (hasLayerAttr(layer, "data_format"))
{
std::string format = getLayerAttr(layer, "data_format").s();
if (format == "NHWC" || format == "channels_last")
data_layouts[name] = DATA_LAYOUT_NHWC;
else if (format == "NCHW" || format == "channels_first")
data_layouts[name] = DATA_LAYOUT_NCHW;
else
CV_Error(Error::StsParseError, "Unknown data_format value: " + format);
}
else
if (data_layouts[name] == DATA_LAYOUT_UNKNOWN)
data_layouts[name] = DATA_LAYOUT_NHWC;
}
else if (type == "BiasAdd" || type == "Add")
@ -956,7 +964,7 @@ void TFImporter::populateNet(Net dstNet)
{
int axisId = (type == "Concat" ? 0 : layer.input_size() - 1);
int axis = getConstBlob(layer, value_id, axisId).int_val().Get(0);
layerParams.set("axis", 0 <= axis && axis < 4 ? toNCHW[axis] : axis);
layerParams.set("axis", 0 <= axis && axis < 4 ? toNCHW(axis) : axis);
int id = dstNet.addLayer(name, "Concat", layerParams);
layer_id[name] = id;
@ -1017,7 +1025,7 @@ void TFImporter::populateNet(Net dstNet)
// num_split
// 1st blob is dims tensor
int axis = getConstBlob(layer, value_id, 0).int_val().Get(0);
layerParams.set("axis", toNCHW[axis]);
layerParams.set("axis", toNCHW(axis));
int id = dstNet.addLayer(name, "Slice", layerParams);
layer_id[name] = id;
@ -1410,9 +1418,26 @@ void TFImporter::populateNet(Net dstNet)
{
// op: "L2Normalize"
// input: "input"
CV_Assert(layer.input_size() == 1);
layerParams.set("across_spatial", false);
layerParams.set("channel_shared", false);
// input: "reduction_indices" (axis)
CV_Assert(layer.input_size() == 2);
Mat reductionIndices = getTensorContent(getConstBlob(layer, value_id, 1));
CV_Assert(reductionIndices.type() == CV_32SC1);
const int numAxes = reductionIndices.total();
if (data_layouts[name] == DATA_LAYOUT_NHWC)
for (int i = 0; i < numAxes; ++i)
reductionIndices.at<int>(i) = toNCHW(reductionIndices.at<int>(i));
cv::sort(reductionIndices, reductionIndices, SORT_ASCENDING);
for (int i = 1; i < numAxes; ++i)
{
CV_Assert(reductionIndices.at<int>(i) == reductionIndices.at<int>(i - 1) + 1);
// Axes have the same sign.
CV_Assert(reductionIndices.at<int>(i) * reductionIndices.at<int>(i - 1) >= 0);
}
layerParams.set("start_axis", reductionIndices.at<int>(0));
layerParams.set("end_axis", reductionIndices.at<int>(numAxes - 1));
int id = dstNet.addLayer(name, "Normalize", layerParams);
layer_id[name] = id;
connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
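Worked example: tf.nn.l2_normalize(x, 3) on an NHWC tensor stores reduction_indices = [3]; after the toNCHW remap this becomes axis 1, so the layer receives start_axis = end_axis = 1 and normalizes across channels only.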

@ -100,6 +100,8 @@ public:
TEST_P(DNNTestNetwork, AlexNet)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
Size(227, 227), "prob",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_alexnet.yml" :
@ -108,6 +110,8 @@ TEST_P(DNNTestNetwork, AlexNet)
TEST_P(DNNTestNetwork, ResNet_50)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/ResNet-50-model.caffemodel", "dnn/ResNet-50-deploy.prototxt",
Size(224, 224), "prob",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_resnet_50.yml" :
@ -116,6 +120,8 @@ TEST_P(DNNTestNetwork, ResNet_50)
TEST_P(DNNTestNetwork, SqueezeNet_v1_1)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/squeezenet_v1.1.caffemodel", "dnn/squeezenet_v1.1.prototxt",
Size(227, 227), "prob",
target == DNN_TARGET_OPENCL ? "dnn/halide_scheduler_opencl_squeezenet_v1_1.yml" :
@ -124,6 +130,8 @@ TEST_P(DNNTestNetwork, SqueezeNet_v1_1)
TEST_P(DNNTestNetwork, GoogLeNet)
{
if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/bvlc_googlenet.caffemodel", "dnn/bvlc_googlenet.prototxt",
Size(224, 224), "prob");
}
@ -147,7 +155,9 @@ TEST_P(DNNTestNetwork, ENet)
TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
@ -157,7 +167,9 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
TEST_P(DNNTestNetwork, MobileNet_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
processNet("dnn/ssd_mobilenet_v1_coco.pb", "dnn/ssd_mobilenet_v1_coco.pbtxt",
@ -177,35 +189,45 @@ TEST_P(DNNTestNetwork, SSD_VGG16)
TEST_P(DNNTestNetwork, OpenPose_pose_coco)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
double l1 = target == DNN_TARGET_OPENCL_FP16 ? 3e-5 : 1e-5;
double lInf = target == DNN_TARGET_OPENCL_FP16 ? 3e-3 : 1e-4;
processNet("dnn/openpose_pose_coco.caffemodel", "dnn/openpose_pose_coco.prototxt",
Size(368, 368), "");
Size(368, 368), "", "", l1, lInf);
}
TEST_P(DNNTestNetwork, OpenPose_pose_mpi)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
double l1 = target == DNN_TARGET_OPENCL_FP16 ? 4e-5 : 1e-5;
double lInf = target == DNN_TARGET_OPENCL_FP16 ? 7e-3 : 1e-4;
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt",
Size(368, 368), "");
Size(368, 368), "", "", l1, lInf);
}
TEST_P(DNNTestNetwork, OpenPose_pose_mpi_faster_4_stages)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
double l1 = target == DNN_TARGET_OPENCL_FP16 ? 5e-5 : 1e-5;
double lInf = target == DNN_TARGET_OPENCL_FP16 ? 5e-3 : 1e-4;
// The same .caffemodel but modified .prototxt
// See https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp
processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi_faster_4_stages.prototxt",
Size(368, 368), "");
Size(368, 368), "", "", l1, lInf);
}
TEST_P(DNNTestNetwork, OpenFace)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
processNet("dnn/openface_nn4.small2.v1.t7", "", Size(96, 96), "");
}
TEST_P(DNNTestNetwork, opencv_face_detector)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat img = imread(findDataFile("gpu/lbpcascade/er.png", false));
Mat inp = blobFromImage(img, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false);
processNet("dnn/opencv_face_detector.caffemodel", "dnn/opencv_face_detector.prototxt",
@ -214,13 +236,23 @@ TEST_P(DNNTestNetwork, opencv_face_detector)
TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
{
if (backend == DNN_BACKEND_HALIDE) throw SkipTestException("");
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target != DNN_TARGET_CPU)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/street.png", false));
Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "dnn/ssd_inception_v2_coco_2017_11_17.pbtxt",
inp, "detection_out");
}
TEST_P(DNNTestNetwork, DenseNet_121)
{
if (backend == DNN_BACKEND_HALIDE ||
backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("");
processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "caffe");
}
const tuple<DNNBackend, DNNTarget> testCases[] = {
#ifdef HAVE_HALIDE
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_HALIDE, DNN_TARGET_CPU),
@ -228,6 +260,8 @@ const tuple<DNNBackend, DNNTarget> testCases[] = {
#endif
#ifdef HAVE_INF_ENGINE
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL),
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16),
#endif
tuple<DNNBackend, DNNTarget>(DNN_BACKEND_DEFAULT, DNN_TARGET_OPENCL)
};

@ -866,6 +866,44 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy)
normAssert(outDefault, out);
}
// 1. Create a .prototxt file with the following network:
// layer {
// type: "Input" name: "data" top: "data"
// input_param { shape { dim: 1 dim: 2 dim: 3 } }
// }
// layer {
// type: "Input" name: "second_input" top: "second_input"
// input_param { shape { dim: 1 dim: 2 dim: 3 } }
// }
// layer {
// type: "Eltwise" name: "output" top: "output"
// bottom: "data" bottom: "second_input"
// eltwise_param { operation: SUM }
// }
//
// 2. Create a .caffemodel file using Caffe:
//
// import caffe
// net = caffe.Net('/path/to/prototxt', caffe.TEST)
// net.save('/path/to/caffemodel')
//
// 3. Convert using ModelOptimizer.
TEST(Test_DLDT, two_inputs)
{
Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin"));
int inpSize[] = {1, 2, 3};
Mat firstInp(3, &inpSize[0], CV_32F);
Mat secondInp(3, &inpSize[0], CV_32F);
randu(firstInp, -1, 1);
randu(secondInp, -1, 1);
net.setInput(firstInp, "data");
net.setInput(secondInp, "second_input");
Mat out = net.forward();
normAssert(out, firstInp + secondInp);
}
#endif // HAVE_INF_ENGINE
}} // namespace

@ -53,7 +53,7 @@ namespace opencv_test {
using namespace cv::dnn;
CV_ENUM(DNNBackend, DNN_BACKEND_DEFAULT, DNN_BACKEND_HALIDE, DNN_BACKEND_INFERENCE_ENGINE)
CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL)
CV_ENUM(DNNTarget, DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16)
static testing::internal::ParamGenerator<DNNTarget> availableDnnTargets()
{

@ -193,6 +193,13 @@ TEST_P(Test_TensorFlow_layers, reshape)
runTensorFlowNet("unfused_flatten_unknown_batch", targetId);
}
TEST_P(Test_TensorFlow_layers, l2_normalize)
{
int targetId = GetParam();
runTensorFlowNet("l2_normalize", targetId);
runTensorFlowNet("l2_normalize_3d", targetId);
}
INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_layers, availableDnnTargets());
typedef testing::TestWithParam<DNNTarget> Test_TensorFlow_nets;

@ -229,7 +229,7 @@ OCL_PERF_TEST_P(RemapFixture, Remap,
OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
SANITY_CHECK(dst, eps);
SANITY_CHECK_NOTHING();
}
} } // namespace opencv_test::ocl

@ -202,8 +202,8 @@ PERF_TEST_P( TestWarpPerspectiveNear_t, WarpPerspectiveNear,
PERF_TEST_P( TestRemap, remap,
Combine(
Values( TYPICAL_MAT_TYPES ),
Values( szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 ),
Values( szVGA, sz1080p ),
InterType::all(),
BorderMode::all(),
RemapMode::all()
@ -231,7 +231,7 @@ PERF_TEST_P( TestRemap, remap,
remap(source, destination, map_x, map_y, interpolationType, borderMode);
}
SANITY_CHECK(destination, 1);
SANITY_CHECK_NOTHING();
}
void update_map(const Mat& src, Mat& map_x, Mat& map_y, const int remapMode )

@ -547,7 +547,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
size_t src_step = _src.step(), src_offset = _src.offset();
const size_t tileSizeYmax = wgs / tileSizeX;
// workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory
// workaround for NVIDIA: 3 channel vector type takes 4*elem_size in local memory
int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;
if (((src_offset % src_step) % esz == 0) &&

@ -36,19 +36,25 @@ public:
typedef fixedpoint64 WT;
CV_ALWAYS_INLINE fixedpoint64() { val = 0; }
CV_ALWAYS_INLINE fixedpoint64(const int8_t& _val) { val = ((int64_t)_val) << fixedShift; }
CV_ALWAYS_INLINE fixedpoint64(const uint8_t& _val) { val = ((int64_t)_val) << fixedShift; }
CV_ALWAYS_INLINE fixedpoint64(const int16_t& _val) { val = ((int64_t)_val) << fixedShift; }
CV_ALWAYS_INLINE fixedpoint64(const uint16_t& _val) { val = ((int64_t)_val) << fixedShift; }
CV_ALWAYS_INLINE fixedpoint64(const int32_t& _val) { val = ((int64_t)_val) << fixedShift; }
CV_ALWAYS_INLINE fixedpoint64(const cv::softdouble& _val) { val = cvRound64(_val * cv::softdouble((int64_t)(1LL << fixedShift))); }
CV_ALWAYS_INLINE fixedpoint64& operator = (const int8_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE fixedpoint64& operator = (const uint8_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE fixedpoint64& operator = (const int16_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE fixedpoint64& operator = (const uint16_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE fixedpoint64& operator = (const int32_t& _val) { val = ((int64_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE fixedpoint64& operator = (const cv::softdouble& _val) { val = cvRound64(_val * cv::softdouble((int64_t)(1LL << fixedShift))); return *this; }
CV_ALWAYS_INLINE fixedpoint64& operator = (const fixedpoint64& _val) { val = _val.val; return *this; }
template <typename ET>
CV_ALWAYS_INLINE fixedpoint64 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types
CV_ALWAYS_INLINE fixedpoint64 operator * (const int8_t& val2) const { return operator *(fixedpoint64(val2)); }
CV_ALWAYS_INLINE fixedpoint64 operator * (const uint8_t& val2) const { return operator *(fixedpoint64(val2)); }
CV_ALWAYS_INLINE fixedpoint64 operator * (const int16_t& val2) const { return operator *(fixedpoint64(val2)); }
CV_ALWAYS_INLINE fixedpoint64 operator * (const uint16_t& val2) const { return operator *(fixedpoint64(val2)); }
CV_ALWAYS_INLINE fixedpoint64 operator * (const int32_t& val2) const { return operator *(fixedpoint64(val2)); }
CV_ALWAYS_INLINE fixedpoint64 operator * (const fixedpoint64& val2) const
{
// Assume -0x00000000C0000000 <= val2 <= 0x0000000100000000 and INT64_MIN <= val <= INT64_MAX, so the shifted multiplication result is inside the [INT64_MIN, INT64_MAX] range
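// Strategy: multiply the absolute values split into 32-bit halves (schoolbook long
// multiplication of the four partial products), then restore the sign and saturate
// to INT64_MAX/INT64_MIN if the recombined product overflows.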
uint64_t uval = (uint64_t)((val ^ (val >> 63)) - (val >> 63));
uint64_t umul = (uint64_t)((val2.val ^ (val2.val >> 63)) - (val2.val >> 63));
int64_t ressign = (val >> 63) ^ (val2.val >> 63);
@ -61,6 +67,9 @@ public:
uint64_t val0_h = (sh2 & 0xFFFFFFFF) + (sh1_0 >> 32) + (sh1_1 >> 32) + (val0_l >> 32);
val0_l &= 0xFFFFFFFF;
if ((sh2 >> 32) || (val0_h >> (ressign ? 32 : 31)))
return (ressign ? ~(int64_t)0x7FFFFFFFFFFFFFFF : (int64_t)0x7FFFFFFFFFFFFFFF);
if (ressign)
{
val0_l = (~val0_l + 1) & 0xFFFFFFFF;
@ -68,16 +77,19 @@ public:
}
return (int64_t)(val0_h << 32 | val0_l);
}
CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const { return fixedpoint64(val + val2.val); }
CV_ALWAYS_INLINE fixedpoint64 operator - (const fixedpoint64& val2) const { return fixedpoint64(val - val2.val); }
// CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const
// {
// int64_t nfrac = (int64_t)frac + val2.frac;
// int64_t nval = (int64_t)val + val2.val + nfrac >> 32;
// return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0);
// }
CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const
{
int64_t res = val + val2.val;
return ((val ^ res) & (val2.val ^ res)) >> 63 ? ~(res & ~0x7FFFFFFFFFFFFFFF) : res;
}
CV_ALWAYS_INLINE fixedpoint64 operator - (const fixedpoint64& val2) const
{
int64_t res = val - val2.val;
return ((val ^ val2.val) & (val ^ res)) >> 63 ? ~(res & ~0x7FFFFFFFFFFFFFFF) : res;
}
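// Both overflow tests use standard two's-complement sign checks: an addition
// overflows when the operands share a sign that the result lost; a subtraction
// overflows when the operands differ in sign and the result's sign differs from
// the minuend's.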
CV_ALWAYS_INLINE fixedpoint64 operator >> (int n) const { return fixedpoint64(val >> n); }
CV_ALWAYS_INLINE fixedpoint64 operator << (int n) const { return fixedpoint64(val << n); }
CV_ALWAYS_INLINE bool operator == (const fixedpoint64& val2) const { return val == val2.val; }
template <typename ET>
CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>((int64_t)fixedround((uint64_t)val) >> fixedShift); }
CV_ALWAYS_INLINE operator double() const { return (double)val / (1LL << fixedShift); }
@ -108,31 +120,36 @@ public:
CV_ALWAYS_INLINE ufixedpoint64& operator = (const uint32_t& _val) { val = ((uint64_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE ufixedpoint64& operator = (const cv::softdouble& _val) { val = _val.getSign() ? 0 : (uint64_t)cvRound64(_val * cv::softdouble((int64_t)(1LL << fixedShift))); return *this; }
CV_ALWAYS_INLINE ufixedpoint64& operator = (const ufixedpoint64& _val) { val = _val.val; return *this; }
template <typename ET>
CV_ALWAYS_INLINE ufixedpoint64 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types
CV_ALWAYS_INLINE ufixedpoint64 operator * (const uint8_t& val2) const { return operator *(ufixedpoint64(val2)); }
CV_ALWAYS_INLINE ufixedpoint64 operator * (const uint16_t& val2) const { return operator *(ufixedpoint64(val2)); }
CV_ALWAYS_INLINE ufixedpoint64 operator * (const uint32_t& val2) const { return operator *(ufixedpoint64(val2)); }
CV_ALWAYS_INLINE ufixedpoint64 operator * (const ufixedpoint64& val2) const
{
// Assume val2 <= 0x0000000100000000, so the shifted multiplication result is less than val and therefore less than UINT64_MAX
uint64_t sh0 = fixedround((val & 0xFFFFFFFF) * (val2.val & 0xFFFFFFFF));
uint64_t sh1_0 = (val >> 32) * (val2.val & 0xFFFFFFFF);
uint64_t sh1_1 = (val & 0xFFFFFFFF) * (val2.val >> 32);
uint64_t sh2 = (val >> 32) * (val2.val >> 32);
uint64_t val0_l = (sh1_0 & 0xFFFFFFFF) + (sh1_1 & 0xFFFFFFFF) + (sh0 >> 32);
uint64_t val0_h = (sh2 & 0xFFFFFFFF) + (sh1_0 >> 32) + (sh1_1 >> 32) + (val0_l >> 32);
val0_l &= 0xFFFFFFFF;
if ((sh2 >> 32) || (val0_h >> 32))
return ((uint64_t)0xFFFFFFFFFFFFFFFF);
return val0_h << 32 | val0_l;
}
CV_ALWAYS_INLINE ufixedpoint64 operator + (const ufixedpoint64& val2) const { return ufixedpoint64(val + val2.val); }
CV_ALWAYS_INLINE ufixedpoint64 operator - (const ufixedpoint64& val2) const { return ufixedpoint64(val - val2.val); }
// CV_ALWAYS_INLINE fixedpoint64 operator + (const fixedpoint64& val2) const
// {
// int64_t nfrac = (int64_t)frac + val2.frac;
// int64_t nval = (int64_t)val + val2.val + nfrac >> 32;
// return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0);
// }
CV_ALWAYS_INLINE ufixedpoint64 operator + (const ufixedpoint64& val2) const
{
uint64_t res = val + val2.val;
return (val > res) ? (uint64_t)0xFFFFFFFFFFFFFFFF : res;
}
CV_ALWAYS_INLINE ufixedpoint64 operator - (const ufixedpoint64& val2) const
{
return val > val2.val ? (val - val2.val) : 0;
}
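// Unsigned saturation is simpler: a wrapped sum ends up smaller than either operand
// (val > res), so it is clamped to the all-ones maximum; subtraction clamps at zero.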
CV_ALWAYS_INLINE ufixedpoint64 operator >> (int n) const { return ufixedpoint64(val >> n); }
CV_ALWAYS_INLINE ufixedpoint64 operator << (int n) const { return ufixedpoint64(val << n); }
CV_ALWAYS_INLINE bool operator == (const ufixedpoint64& val2) const { return val == val2.val; }
template <typename ET>
CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
CV_ALWAYS_INLINE operator double() const { return (double)val / (1LL << fixedShift); }
@ -163,21 +180,26 @@ public:
CV_ALWAYS_INLINE fixedpoint32& operator = (const int16_t& _val) { val = ((int32_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE fixedpoint32& operator = (const cv::softdouble& _val) { val = (int32_t)cvRound(_val * cv::softdouble((1 << fixedShift))); return *this; }
CV_ALWAYS_INLINE fixedpoint32& operator = (const fixedpoint32& _val) { val = _val.val; return *this; }
template <typename ET>
CV_ALWAYS_INLINE fixedpoint32 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types
CV_ALWAYS_INLINE fixedpoint32 operator * (const int8_t& val2) const { return cv::saturate_cast<int32_t>((int64_t)val * val2); }
CV_ALWAYS_INLINE fixedpoint32 operator * (const uint8_t& val2) const { return cv::saturate_cast<int32_t>((int64_t)val * val2); }
CV_ALWAYS_INLINE fixedpoint32 operator * (const int16_t& val2) const { return cv::saturate_cast<int32_t>((int64_t)val * val2); }
CV_ALWAYS_INLINE fixedpoint64 operator * (const fixedpoint32& val2) const { return (int64_t)val * (int64_t)(val2.val); }
CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const { return fixedpoint32(val + val2.val); }
CV_ALWAYS_INLINE fixedpoint32 operator - (const fixedpoint32& val2) const { return fixedpoint32(val - val2.val); }
// CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const
// {
// int32_t nfrac = (int32_t)frac + val2.frac;
// int32_t nval = (int32_t)val + val2.val + nfrac >> 32;
// return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0);
// }
CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const
{
int32_t res = val + val2.val;
return ((val ^ res) & (val2.val ^ res)) >> 31 ? ~(res & ~0x7FFFFFFF) : res;
}
CV_ALWAYS_INLINE fixedpoint32 operator - (const fixedpoint32& val2) const
{
int32_t res = val - val2.val;
return ((val ^ val2.val) & (val ^ res)) >> 31 ? ~(res & ~0x7FFFFFFF) : res;
}
CV_ALWAYS_INLINE fixedpoint32 operator >> (int n) const { return fixedpoint32(val >> n); }
CV_ALWAYS_INLINE fixedpoint32 operator << (int n) const { return fixedpoint32(val << n); }
CV_ALWAYS_INLINE bool operator == (const fixedpoint32& val2) const { return val == val2.val; }
template <typename ET>
CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>((int32_t)fixedround((uint32_t)val) >> fixedShift); }
CV_ALWAYS_INLINE operator fixedpoint64() const { return (int64_t)val << (fixedpoint64::fixedShift - fixedShift); }
CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
CV_ALWAYS_INLINE bool isZero() { return val == 0; }
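The new fixedpoint32 operator + and operator - above rely on a branchless sign-bit test: an addition overflowed iff both operands share a sign that the wrapped result lacks, and a subtraction overflowed iff the operands differ in sign and the result's sign disagrees with the minuend's. Note that the clamp ~(res & ~0x7FFFFFFF) in the patch yields INT32_MAX for positive overflow but -1 for negative overflow; the sketch below (ours, not the patch's code) uses the symmetric form (res >> 31) ^ INT32_MIN, which clamps both directions, and does the wrapping arithmetic in unsigned to avoid signed-overflow UB:

#include <cstdint>
#include <cstdio>

static int32_t add_sat32(int32_t a, int32_t b)
{
    int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);   // wrapping add
    if (((a ^ res) & (b ^ res)) < 0)                      // sign-bit overflow test
        return (res >> 31) ^ INT32_MIN;                   // res < 0 -> MAX, res >= 0 -> MIN
    return res;
}

static int32_t sub_sat32(int32_t a, int32_t b)
{
    int32_t res = (int32_t)((uint32_t)a - (uint32_t)b);   // wrapping subtract
    if (((a ^ b) & (a ^ res)) < 0)                        // signs differ and res flips
        return (res >> 31) ^ INT32_MIN;
    return res;
}

int main()
{
    printf("%d\n", add_sat32(INT32_MAX, 1));   // 2147483647
    printf("%d\n", add_sat32(INT32_MIN, -1));  // -2147483648
    printf("%d\n", sub_sat32(INT32_MIN, 1));   // -2147483648
    printf("%d\n", add_sat32(40, 2));          // 42
    return 0;
}

Like the patch itself, this assumes an arithmetic right shift of negative values, which every compiler OpenCV targets provides.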
@ -204,21 +226,24 @@ public:
CV_ALWAYS_INLINE ufixedpoint32& operator = (const uint16_t& _val) { val = ((uint32_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE ufixedpoint32& operator = (const cv::softdouble& _val) { val = _val.getSign() ? 0 : (uint32_t)cvRound(_val * cv::softdouble((1 << fixedShift))); return *this; }
CV_ALWAYS_INLINE ufixedpoint32& operator = (const ufixedpoint32& _val) { val = _val.val; return *this; }
template <typename ET>
CV_ALWAYS_INLINE ufixedpoint32 operator * (const ET& val2) const { return val * val2; } // Wrong rounding is possible for floating point types
CV_ALWAYS_INLINE ufixedpoint32 operator * (const uint8_t& val2) const { return cv::saturate_cast<uint32_t>((uint64_t)val * val2); }
CV_ALWAYS_INLINE ufixedpoint32 operator * (const uint16_t& val2) const { return cv::saturate_cast<uint32_t>((uint64_t)val * val2); }
CV_ALWAYS_INLINE ufixedpoint64 operator * (const ufixedpoint32& val2) const { return (uint64_t)val * (uint64_t)(val2.val); }
CV_ALWAYS_INLINE ufixedpoint32 operator + (const ufixedpoint32& val2) const { return ufixedpoint32(val + val2.val); }
CV_ALWAYS_INLINE ufixedpoint32 operator - (const ufixedpoint32& val2) const { return ufixedpoint32(val - val2.val); }
// CV_ALWAYS_INLINE fixedpoint32 operator + (const fixedpoint32& val2) const
// {
// int32_t nfrac = (int32_t)frac + val2.frac;
// int32_t nval = (int32_t)val + val2.val + nfrac >> 32;
// return nval > MAXINT32 ? beConv(MAXINT32, MAXINT32) : beConv((int32_t)(nval), 0);
// }
CV_ALWAYS_INLINE ufixedpoint32 operator + (const ufixedpoint32& val2) const
{
uint32_t res = val + val2.val;
return (val > res) ? 0xFFFFFFFF : res;
}
CV_ALWAYS_INLINE ufixedpoint32 operator - (const ufixedpoint32& val2) const
{
return val > val2.val ? (val - val2.val) : 0;
}
CV_ALWAYS_INLINE ufixedpoint32 operator >> (int n) const { return ufixedpoint32(val >> n); }
CV_ALWAYS_INLINE ufixedpoint32 operator << (int n) const { return ufixedpoint32(val << n); }
CV_ALWAYS_INLINE bool operator == (const ufixedpoint32& val2) const { return val == val2.val; }
template <typename ET>
CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
CV_ALWAYS_INLINE operator ufixedpoint64() const { return (uint64_t)val << (ufixedpoint64::fixedShift - fixedShift); }
CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
CV_ALWAYS_INLINE bool isZero() { return val == 0; }
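The unsigned counterparts above need no sign games: after a wrapping add, the result is smaller than an operand iff it wrapped, and subtraction simply floors at zero. A two-line sketch of the same idiom (hypothetical names):

#include <cstdint>

static uint32_t addu_sat32(uint32_t a, uint32_t b) { uint32_t r = a + b; return r < a ? UINT32_MAX : r; }
static uint32_t subu_sat32(uint32_t a, uint32_t b) { return a > b ? a - b : 0; }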
@ -239,20 +264,28 @@ public:
typedef fixedpoint32 WT;
CV_ALWAYS_INLINE fixedpoint16() { val = 0; }
CV_ALWAYS_INLINE fixedpoint16(const int8_t& _val) { val = ((int16_t)_val) << fixedShift; }
CV_ALWAYS_INLINE fixedpoint16(const uint8_t& _val) { val = ((int16_t)_val) << fixedShift; }
CV_ALWAYS_INLINE fixedpoint16(const cv::softdouble& _val) { val = (int16_t)cvRound(_val * cv::softdouble((1 << fixedShift))); }
CV_ALWAYS_INLINE fixedpoint16& operator = (const int8_t& _val) { val = ((int16_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE fixedpoint16& operator = (const cv::softdouble& _val) { val = (int16_t)cvRound(_val * cv::softdouble((1 << fixedShift))); return *this; }
CV_ALWAYS_INLINE fixedpoint16& operator = (const fixedpoint16& _val) { val = _val.val; return *this; }
template <typename ET>
CV_ALWAYS_INLINE fixedpoint16 operator * (const ET& val2) const { return (int16_t)(val * val2); } // Wrong rounding is possible for floating point types
CV_ALWAYS_INLINE fixedpoint16 operator * (const int8_t& val2) const { return cv::saturate_cast<int16_t>((int32_t)val * val2); }
CV_ALWAYS_INLINE fixedpoint32 operator * (const fixedpoint16& val2) const { return (int32_t)val * (int32_t)(val2.val); }
CV_ALWAYS_INLINE fixedpoint16 operator + (const fixedpoint16& val2) const { return fixedpoint16((int16_t)(val + val2.val)); }
CV_ALWAYS_INLINE fixedpoint16 operator - (const fixedpoint16& val2) const { return fixedpoint16((int16_t)(val - val2.val)); }
CV_ALWAYS_INLINE fixedpoint16 operator + (const fixedpoint16& val2) const
{
int16_t res = val + val2.val;
return ((val ^ res) & (val2.val ^ res)) >> 15 ? (int16_t)(~(res & ~0x7FFF)) : res;
}
CV_ALWAYS_INLINE fixedpoint16 operator - (const fixedpoint16& val2) const
{
int16_t res = val - val2.val;
return ((val ^ val2.val) & (val ^ res)) >> 15 ? (int16_t)(~(res & ~(int16_t)0x7FFF)) : res;
}
CV_ALWAYS_INLINE fixedpoint16 operator >> (int n) const { return fixedpoint16((int16_t)(val >> n)); }
CV_ALWAYS_INLINE fixedpoint16 operator << (int n) const { return fixedpoint16((int16_t)(val << n)); }
CV_ALWAYS_INLINE bool operator == (const fixedpoint16& val2) const { return val == val2.val; }
template <typename ET>
CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>((int16_t)fixedround((uint16_t)val) >> fixedShift); }
CV_ALWAYS_INLINE operator fixedpoint32() const { return (int32_t)val << (fixedpoint32::fixedShift - fixedShift); }
CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
CV_ALWAYS_INLINE bool isZero() { return val == 0; }
@ -276,15 +309,23 @@ public:
CV_ALWAYS_INLINE ufixedpoint16& operator = (const uint8_t& _val) { val = ((uint16_t)_val) << fixedShift; return *this; }
CV_ALWAYS_INLINE ufixedpoint16& operator = (const cv::softdouble& _val) { val = _val.getSign() ? 0 : (uint16_t)cvRound(_val * cv::softdouble((int32_t)(1 << fixedShift))); return *this; }
CV_ALWAYS_INLINE ufixedpoint16& operator = (const ufixedpoint16& _val) { val = _val.val; return *this; }
template <typename ET>
CV_ALWAYS_INLINE ufixedpoint16 operator * (const ET& val2) const { return (uint16_t)(val * val2); } // Wrong rounding is possible for floating point types
CV_ALWAYS_INLINE ufixedpoint16 operator * (const uint8_t& val2) const { return cv::saturate_cast<uint16_t>((uint32_t)val * val2); }
CV_ALWAYS_INLINE ufixedpoint32 operator * (const ufixedpoint16& val2) const { return ((uint32_t)val * (uint32_t)(val2.val)); }
CV_ALWAYS_INLINE ufixedpoint16 operator + (const ufixedpoint16& val2) const { return ufixedpoint16((uint16_t)(val + val2.val)); }
CV_ALWAYS_INLINE ufixedpoint16 operator - (const ufixedpoint16& val2) const { return ufixedpoint16((uint16_t)(val - val2.val)); }
CV_ALWAYS_INLINE ufixedpoint16 operator + (const ufixedpoint16& val2) const
{
uint16_t res = val + val2.val;
return (val > res) ? (uint16_t)0xFFFF : res;
}
CV_ALWAYS_INLINE ufixedpoint16 operator - (const ufixedpoint16& val2) const
{
return val > val2.val ? (uint16_t)(val - val2.val) : (uint16_t)0;
}
CV_ALWAYS_INLINE ufixedpoint16 operator >> (int n) const { return ufixedpoint16((uint16_t)(val >> n)); }
CV_ALWAYS_INLINE ufixedpoint16 operator << (int n) const { return ufixedpoint16((uint16_t)(val << n)); }
CV_ALWAYS_INLINE bool operator == (const ufixedpoint16& val2) const { return val == val2.val; }
template <typename ET>
CV_ALWAYS_INLINE operator ET() const { return cv::saturate_cast<ET>(fixedround(val) >> fixedShift); }
CV_ALWAYS_INLINE operator ufixedpoint32() const { return (uint32_t)val << (ufixedpoint32::fixedShift - fixedShift); }
CV_ALWAYS_INLINE operator double() const { return (double)val / (1 << fixedShift); }
CV_ALWAYS_INLINE operator float() const { return (float)val / (1 << fixedShift); }
CV_ALWAYS_INLINE bool isZero() { return val == 0; }

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -77,6 +77,7 @@ protected:
virtual void run_func() = 0;
virtual void run_reference_func() = 0;
virtual float get_success_error_level(int _interpolation, int _depth) const;
virtual void validate_results() const;
virtual void prepare_test_data_for_reference_func();
@ -229,6 +230,20 @@ void CV_ImageWarpBaseTest::run(int)
ts->set_gtest_status();
}
float CV_ImageWarpBaseTest::get_success_error_level(int _interpolation, int) const
{
if (_interpolation == INTER_CUBIC)
return 1.0f;
else if (_interpolation == INTER_LANCZOS4)
return 1.0f;
else if (_interpolation == INTER_NEAREST)
return 1.0f;
else if (_interpolation == INTER_AREA)
return 2.0f;
else
return 1.0f;
}
void CV_ImageWarpBaseTest::validate_results() const
{
Mat _dst;
@ -237,15 +252,7 @@ void CV_ImageWarpBaseTest::validate_results() const
Size dsize = dst.size(), ssize = src.size();
int cn = _dst.channels();
dsize.width *= cn;
float t = 1.0f;
if (interpolation == INTER_CUBIC)
t = 1.0f;
else if (interpolation == INTER_LANCZOS4)
t = 1.0f;
else if (interpolation == INTER_NEAREST)
t = 1.0f;
else if (interpolation == INTER_AREA)
t = 2.0f;
float t = get_success_error_level(interpolation & INTER_MAX, dst.depth());
for (int dy = 0; dy < dsize.height; ++dy)
{
@ -1034,7 +1041,7 @@ public:
protected:
virtual void generate_test_data();
virtual void prepare_test_data_for_reference_func();
virtual float get_success_error_level(int _interpolation, int _depth) const;
virtual void run_func();
virtual void run_reference_func();
@ -1083,16 +1090,16 @@ void CV_WarpAffine_Test::run_func()
cv::warpAffine(src, dst, M, dst.size(), interpolation, borderType, borderValue);
}
void CV_WarpAffine_Test::prepare_test_data_for_reference_func()
float CV_WarpAffine_Test::get_success_error_level(int _interpolation, int _depth) const
{
CV_ImageWarpBaseTest::prepare_test_data_for_reference_func();
return _depth == CV_8U ? 0 : CV_ImageWarpBaseTest::get_success_error_level(_interpolation, _depth);
}
void CV_WarpAffine_Test::run_reference_func()
{
prepare_test_data_for_reference_func();
warpAffine(src, reference_dst);
Mat tmp = Mat::zeros(dst.size(), dst.type());
warpAffine(src, tmp);
tmp.convertTo(reference_dst, reference_dst.depth());
}
void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst)
@ -1123,7 +1130,7 @@ void CV_WarpAffine_Test::warpAffine(const Mat& _src, Mat& _dst)
const int AB_SCALE = 1 << AB_BITS;
int round_delta = (inter == INTER_NEAREST) ? AB_SCALE / 2 : (AB_SCALE / INTER_TAB_SIZE / 2);
const double* data_tM = tM.ptr<double>(0);
const softdouble* data_tM = tM.ptr<softdouble>(0);
for (int dy = 0; dy < dsize.height; ++dy)
{
short* yM = mapx.ptr<short>(dy);
@ -1162,6 +1169,7 @@ public:
protected:
virtual void generate_test_data();
virtual float get_success_error_level(int _interpolation, int _depth) const;
virtual void run_func();
virtual void run_reference_func();
@ -1204,11 +1212,16 @@ void CV_WarpPerspective_Test::run_func()
cv::warpPerspective(src, dst, M, dst.size(), interpolation, borderType, borderValue);
}
void CV_WarpPerspective_Test::run_reference_func()
float CV_WarpPerspective_Test::get_success_error_level(int _interpolation, int _depth) const
{
prepare_test_data_for_reference_func();
return CV_ImageWarpBaseTest::get_success_error_level(_interpolation, _depth);
}
warpPerspective(src, reference_dst);
void CV_WarpPerspective_Test::run_reference_func()
{
Mat tmp = Mat::zeros(dst.size(), dst.type());
warpPerspective(src, tmp);
tmp.convertTo(reference_dst, reference_dst.depth());
}
void CV_WarpPerspective_Test::warpPerspective(const Mat& _src, Mat& _dst)

@ -482,6 +482,7 @@ node {
name: "conv4_3_norm/l2_normalize"
op: "L2Normalize"
input: "Relu_4:0"
input: "conv4_3_norm/l2_normalize/Sum/reduction_indices"
}
node {
name: "conv4_3_norm/mul_1"
