Merge remote-tracking branch 'upstream/3.4' into merge-3.4

3 years ago · 2ebdc04787
parent d09cc0f30c cc8add9f66
commit 2ebdc04787
37 changed files with 246 additions and 90 deletions
--- a/3rdparty/carotene/CMakeLists.txt
+++ b/3rdparty/carotene/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)

 project(Carotene)

--- a/3rdparty/carotene/hal/CMakeLists.txt
+++ b/3rdparty/carotene/hal/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)

 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -46,7 +46,7 @@

 set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F")
 list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL")
-list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16 NEON_DOTPROD)
 list(APPEND CPU_ALL_OPTIMIZATIONS MSA)
 list(APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3)
 list(APPEND CPU_ALL_OPTIMIZATIONS RVV)
@ -329,6 +329,7 @@ if(X86 OR X86_64)
 elseif(ARM OR AARCH64)
  ocv_update(CPU_NEON_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_neon.cpp")
  ocv_update(CPU_FP16_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_fp16.cpp")
+  ocv_update(CPU_NEON_DOTPROD_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_dotprod.cpp")
  if(NOT AARCH64)
    ocv_update(CPU_KNOWN_OPTIMIZATIONS "VFPV3;NEON;FP16")
    if(NOT MSVC)
@ -340,9 +341,11 @@ elseif(ARM OR AARCH64)
    endif()
    ocv_update(CPU_FP16_IMPLIES "NEON")
  else()
-    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16")
+    ocv_update(CPU_KNOWN_OPTIMIZATIONS "NEON;FP16;NEON_DOTPROD")
    ocv_update(CPU_NEON_FLAGS_ON "")
    ocv_update(CPU_FP16_IMPLIES "NEON")
+    ocv_update(CPU_NEON_DOTPROD_FLAGS_ON "-march=armv8.2-a+dotprod")
+    ocv_update(CPU_NEON_DOTPROD_IMPLIES "NEON")
    set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
  endif()
 elseif(MIPS)
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@ -136,7 +136,7 @@ if(CV_GCC OR CV_CLANG)
  endif()
  add_extra_compiler_option(-Wsign-promo)
  add_extra_compiler_option(-Wuninitialized)
-  if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
+  if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0 OR ARM))
    add_extra_compiler_option(-Wno-psabi)
  endif()
  if(HAVE_CXX11)
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -253,12 +253,13 @@ if(CUDA_FOUND)
      endif()
      if(NOT _nvcc_res EQUAL 0)
        message(STATUS "Automatic detection of CUDA generation failed. Going to build for all known architectures.")
-        # TX1 (5.3) TX2 (6.2) Xavier (7.2) V100 (7.0)
+        # TX1 (5.3) TX2 (6.2) Xavier (7.2) V100 (7.0) Orin (8.7)
        ocv_filter_available_architecture(__cuda_arch_bin
            5.3
            6.2
            7.2
            7.0
+            8.7
        )
      else()
        set(__cuda_arch_bin "${_nvcc_out}")
--- a/cmake/OpenCVDetectHalide.cmake
+++ b/cmake/OpenCVDetectHalide.cmake
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION ${MIN_VER_CMAKE})

 if(" ${HALIDE_ROOT_DIR}" STREQUAL " ")
  unset(HALIDE_ROOT_DIR CACHE)
--- a/cmake/checks/cpu_dotprod.cpp
+++ b/cmake/checks/cpu_dotprod.cpp
@ -0,0 +1,24 @@
+#include <stdio.h>
+
+#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+#include "arm_neon.h"
+int test()
+{
+    const unsigned int src[] = { 0, 0, 0, 0 };
+    unsigned int dst[4];
+    uint32x4_t v_src = *(uint32x4_t*)src;
+    uint8x16_t v_m0 = *(uint8x16_t*)src;
+    uint8x16_t v_m1 = *(uint8x16_t*)src;
+    uint32x4_t v_dst = vdotq_u32(v_src, v_m0, v_m1);
+    *(uint32x4_t*)dst = v_dst;
+    return (int)dst[0];
+}
+#else
+#error "DOTPROD is not supported"
+#endif
+
+int main()
+{
+  printf("%d\n", test());
+  return 0;
+}
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@ -2692,7 +2692,7 @@ CV_EXPORTS_W int recoverPose( InputArray points1, InputArray points2,
                            InputOutputArray mask = noArray());

 /** @brief Recovers the relative camera rotation and the translation from an estimated essential
-matrix and the corresponding points in two images, using cheirality check. Returns the number of
+matrix and the corresponding points in two images, using chirality check. Returns the number of
 inliers that pass the check.

@param E The input essential matrix.
@ -2710,11 +2710,11 @@ described below.
 therefore is only known up to scale, i.e. t is the direction of the translation vector and has unit
 length.
@param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
-inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to
-recover pose. In the output mask only inliers which pass the cheirality check.
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
+recover pose. In the output mask only inliers which pass the chirality check.

 This function decomposes an essential matrix using @ref decomposeEssentialMat and then verifies
-possible pose hypotheses by doing cheirality check. The cheirality check means that the
+possible pose hypotheses by doing chirality check. The chirality check means that the
 triangulated 3D points should have positive depth. Some details can be found in @cite Nister03.

 This function can be used to process the output E and mask from @ref findEssentialMat. In this
@ -2761,8 +2761,8 @@ length.
 are feature points from cameras with same focal length and principal point.
@param pp principal point of the camera.
@param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
-inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to
-recover pose. In the output mask only inliers which pass the cheirality check.
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
+recover pose. In the output mask only inliers which pass the chirality check.

 This function differs from the one above that it computes camera intrinsic matrix from focal length and
 principal point:
@ -2797,12 +2797,12 @@ length.
@param distanceThresh threshold distance which is used to filter out far away points (i.e. infinite
 points).
@param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks
-inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to
-recover pose. In the output mask only inliers which pass the cheirality check.
+inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to
+recover pose. In the output mask only inliers which pass the chirality check.
@param triangulatedPoints 3D points which were reconstructed by triangulation.

 This function differs from the one above that it outputs the triangulated 3D point that are used for
-the cheirality check.
+the chirality check.
 */
 CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2,
                            InputArray cameraMatrix, OutputArray R, OutputArray t, double distanceThresh, InputOutputArray mask = noArray(),
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@ -1232,7 +1232,7 @@ int ChessBoardDetector::cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& q
        centers[i] = ci;
        center += ci;
    }
-    center.x *= (1.0f / quad_count);
+    center *= (1.0f / quad_count);

    // If we still have more quadrangles than we should,
    // we try to eliminate bad ones based on minimizing the bounding box.
@ -1256,7 +1256,7 @@ int ChessBoardDetector::cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& q
            Mat points(1, quad_count, CV_32FC2, &centers[0]);
            cv::convexHull(points, hull, true);
            centers[skip] = temp;
-            double hull_area = contourArea(hull, true);
+            double hull_area = contourArea(hull, false);

            // remember smallest box area
            if (hull_area < min_box_area)
@ -1298,6 +1298,7 @@ int ChessBoardDetector::cleanFoundConnectedQuads(std::vector<ChessBoardQuad*>& q
        quad_group[min_box_area_index] = quad_group[quad_count];
        centers[min_box_area_index] = centers[quad_count];
    }
+    quad_group.resize(quad_count);

    return quad_count;
 }
--- a/modules/calib3d/src/five-point.cpp
+++ b/modules/calib3d/src/five-point.cpp
@ -601,7 +601,7 @@ int cv::recoverPose( InputArray E, InputArray _points1, InputArray _points2,
    P3(Range::all(), Range(0, 3)) = R1 * 1.0; P3.col(3) = -t * 1.0;
    P4(Range::all(), Range(0, 3)) = R2 * 1.0; P4.col(3) = -t * 1.0;

-    // Do the cheirality check.
+    // Do the chirality check.
    // Notice here a threshold dist is used to filter
    // out far away points (i.e. infinite points) since
    // their depth may vary between positive and negative.
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
 ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
 ocv_add_dispatched_file(convert_scale SSE2 AVX2)
 ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
-ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX)
+ocv_add_dispatched_file(matmul SSE2 SSE4_1 AVX2 AVX512_SKX NEON_DOTPROD)
 ocv_add_dispatched_file(mean SSE2 AVX2)
 ocv_add_dispatched_file(merge SSE2 AVX2)
 ocv_add_dispatched_file(split SSE2 AVX2)
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@ -79,6 +79,10 @@
 #  endif
 #  define CV_FP16 1
 #endif
+#ifdef CV_CPU_COMPILE_NEON_DOTPROD
+#  include <arm_neon.h>
+#  define CV_NEON_DOT 1
+#endif
 #ifdef CV_CPU_COMPILE_AVX2
 #  include <immintrin.h>
 #  define CV_AVX2 1
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@ -420,6 +420,27 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...)  CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))

+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD
+#  define CV_TRY_NEON_DOTPROD 1
+#  define CV_CPU_FORCE_NEON_DOTPROD 1
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD
+#  define CV_TRY_NEON_DOTPROD 1
+#  define CV_CPU_FORCE_NEON_DOTPROD 0
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD))
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
+#else
+#  define CV_TRY_NEON_DOTPROD 0
+#  define CV_CPU_FORCE_NEON_DOTPROD 0
+#  define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0
+#  define CV_CPU_CALL_NEON_DOTPROD(fn, args)
+#  define CV_CPU_CALL_NEON_DOTPROD_(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...)  CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
 #  define CV_TRY_MSA 1
 #  define CV_CPU_FORCE_MSA 1
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -268,6 +268,7 @@ namespace cv {
 #define CV_CPU_AVX_5124FMAPS    27

 #define CV_CPU_NEON             100
+#define CV_CPU_NEON_DOTPROD     101

 #define CV_CPU_MSA              150

@ -324,6 +325,7 @@ enum CpuFeatures {
    CPU_AVX_5124FMAPS   = 27,

    CPU_NEON            = 100,
+    CPU_NEON_DOTPROD    = 101,

    CPU_MSA             = 150,

--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@ -78,8 +78,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_NEON_AARCH64 0
 #endif

-// TODO
-#define CV_NEON_DOT 0

 //////////// Utils ////////////

@ -665,11 +663,22 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64
 }

 // 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b)   \
+{ \
+    return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
+} \
+inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ \
+    return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4,  v_int8x16,  s32)
+#else
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
-#else
    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
@ -685,23 +694,15 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
                              vshrq_n_u32(vreinterpretq_u32_u16(odd),  16));
    return v_uint32x4(vaddq_u32(s0, s1));
-#endif
 }
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
-#else
    return v_dotprod_expand(a, b) + c;
-#endif
 }

 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
-#else
    int16x8_t p0  = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    int16x8_t p1  = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
    int16x8_t uzp1, uzp2;
@ -710,18 +711,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
    int16x4_t uzpl1, uzpl2;
    _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
    return v_int32x4(vaddl_s16(uzpl1, uzpl2));
-#endif
 }
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
-#else
    return v_dotprod_expand(a, b) + c;
-#endif
 }
-
+#endif
 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
 {
@ -830,45 +826,44 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
 }

 // 8 >> 32
+#ifdef CV_NEON_DOT
+#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b)   \
+{ \
+    return v_dotprod_expand(a, b); \
+} \
+inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
+{ \
+    return v_dotprod_expand(a, b, c); \
+}
+
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32)
+OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4,  v_int8x16,  s32)
+#else
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
-#else
    uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
    uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
    uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
    uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
    return v_uint32x4(vaddq_u32(s0, s1));
-#endif
 }
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
 {
-#if CV_NEON_DOT
-    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
-#else
    return v_dotprod_expand_fast(a, b) + c;
-#endif
 }

 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
-#else
    int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
    prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
    return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
-#endif
 }
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
 {
-#if CV_NEON_DOT
-    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
-#else
    return v_dotprod_expand_fast(a, b) + c;
-#endif
 }
+#endif

 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
--- a/modules/core/src/buffer_area.cpp
+++ b/modules/core/src/buffer_area.cpp
@ -29,8 +29,7 @@ public:
    }
    void cleanup() const
    {
-        CV_Assert(ptr && *ptr);
-        *ptr = 0;
+        CV_DbgAssert(ptr);
        if (raw_mem)
            fastFree(raw_mem);
    }
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -305,6 +305,9 @@ DECLARE_CV_CPUID_X86
  #endif
 #endif

+#if defined CV_CXX11
+  #include <chrono>
+#endif

 namespace cv
 {
@ -414,6 +417,7 @@ struct HWFeatures
        g_hwFeatureNames[CPU_AVX_5124FMAPS] = "AVX5124FMAPS";

        g_hwFeatureNames[CPU_NEON] = "NEON";
+        g_hwFeatureNames[CPU_NEON_DOTPROD] = "NEON_DOTPROD";

        g_hwFeatureNames[CPU_VSX] = "VSX";
        g_hwFeatureNames[CPU_VSX3] = "VSX3";
@ -561,6 +565,24 @@ struct HWFeatures
    #ifdef __aarch64__
        have[CV_CPU_NEON] = true;
        have[CV_CPU_FP16] = true;
+        int cpufile = open("/proc/self/auxv", O_RDONLY);
+
+        if (cpufile >= 0)
+        {
+            Elf64_auxv_t auxv;
+            const size_t size_auxv_t = sizeof(auxv);
+
+            while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
+            {
+                if (auxv.a_type == AT_HWCAP)
+                {
+                    have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0;
+                    break;
+                }
+            }
+
+            close(cpufile);
+        }
    #elif defined __arm__ && defined __ANDROID__
      #if defined HAVE_CPUFEATURES
        CV_LOG_INFO(NULL, "calling android_getCpuFeatures() ...");
@ -853,7 +875,10 @@ bool useOptimized(void)

 int64 getTickCount(void)
 {
-#if defined _WIN32 || defined WINCE
+#if defined CV_CXX11
+    std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
+    return (int64)now.time_since_epoch().count();
+#elif defined _WIN32 || defined WINCE
    LARGE_INTEGER counter;
    QueryPerformanceCounter( &counter );
    return (int64)counter.QuadPart;
@ -872,7 +897,11 @@ int64 getTickCount(void)

 double getTickFrequency(void)
 {
-#if defined _WIN32 || defined WINCE
+#if defined CV_CXX11
+    using clock_period_t = std::chrono::steady_clock::duration::period;
+    double clock_freq = clock_period_t::den / clock_period_t::num;
+    return clock_freq;
+#elif defined _WIN32 || defined WINCE
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    return (double)freq.QuadPart;
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@ -408,9 +408,6 @@ TEST_P(BufferArea, basic)
            EXPECT_EQ((double)0, dbl_ptr[i]);
        }
    }
-    EXPECT_TRUE(int_ptr == NULL);
-    EXPECT_TRUE(uchar_ptr == NULL);
-    EXPECT_TRUE(dbl_ptr == NULL);
 }

 TEST_P(BufferArea, align)
@ -447,10 +444,6 @@ TEST_P(BufferArea, align)
            }
        }
    }
-    for (size_t i = 0; i < CNT; ++i)
-    {
-        EXPECT_TRUE(buffers[i] == NULL);
-    }
 }

 TEST_P(BufferArea, default_align)
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@ -132,6 +132,16 @@ public:
        if (hasWeights && hasBias)
            CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");

+        if (weights.total() == 1)
+        {
+            // The total() of bias should be same as weights.
+            if (hasBias)
+                inpBlob.convertTo(outBlob, CV_32F, weights.at<float>(0), bias.at<float>(0));
+            else
+                inpBlob.convertTo(outBlob, CV_32F, weights.at<float>(0));
+            return;
+        }
+
        int endAxis;
        for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis)
        {
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@ -2026,6 +2026,8 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node

 void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis)
 {
+    // Currently, this function can only complete 1-dimensional expansion of broadShape.
+    // If there are two dimensions in broadShape that need to be expended, it will fail.
    const size_t diff = outShape.size() - broadShape.size();

    // find the first non-one element of the broadcasting shape
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@ -1060,6 +1060,8 @@ TEST_P(Test_ONNX_layers, Div)
    normAssert(ref, out, "", default_l1,  default_lInf);
    expectNoFallbacksFromIE(net);
    expectNoFallbacksFromCUDA(net);
+
+    testONNXModels("div_test_1x1",npy, 0, 0, false, true, 2);
 }

 TEST_P(Test_ONNX_layers, DynamicReshape)
--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@ -981,11 +981,20 @@ else // CV_8U
        __pack01 = v_pack_u(v_round(__dst0 * __nrm2), v_round(__dst1 * __nrm2));
        v_pack_store(dst + k, __pack01);
    }
+#endif
+
+#if defined(__GNUC__) && __GNUC__ >= 9
+// avoid warning "iteration 7 invokes undefined behavior" on Linux ARM64
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations"
 #endif
    for( ; k < len; k++ )
    {
        dst[k] = saturate_cast<uchar>(rawDst[k]*nrm2);
    }
+#if defined(__GNUC__) && __GNUC__ >= 9
+#pragma GCC diagnostic pop
+#endif
 }
 #else
    float* dst = dstMat.ptr<float>(row);
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@ -78,11 +78,15 @@ Input depth (src.depth()) | Output depth (ddepth)
 --------------------------|----------------------
 CV_8U                     | -1/CV_16S/CV_32F/CV_64F
 CV_16U/CV_16S             | -1/CV_32F/CV_64F
-CV_32F                    | -1/CV_32F/CV_64F
+CV_32F                    | -1/CV_32F
 CV_64F                    | -1/CV_64F

@note when ddepth=-1, the output image will have the same depth as the source.

+@note if you need double floating-point accuracy and using single floating-point input data
+(CV_32F input and CV_64F output depth combination), you can use @ref Mat.convertTo to convert
+the input data to the desired precision.
+
    @defgroup imgproc_transform Geometric Image Transformations

 The functions in this section perform various geometrical transformations of 2D images. They do not
@ -1792,7 +1796,7 @@ with the following \f$3 \times 3\f$ aperture:

@param src Source image.
@param dst Destination image of the same size and the same number of channels as src .
-@param ddepth Desired depth of the destination image.
+@param ddepth Desired depth of the destination image, see @ref filter_depths "combinations".
@param ksize Aperture size used to compute the second-derivative filters. See #getDerivKernels for
 details. The size must be positive and odd.
@param scale Optional scale factor for the computed Laplacian values. By default, no scaling is
@ -2279,7 +2283,7 @@ case of multi-channel images, each channel is processed independently.
@param src input image; the number of channels can be arbitrary, but the depth should be one of
 CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
@param dst output image of the same size and type as src.
-@param kernel structuring element used for dilation; if elemenat=Mat(), a 3 x 3 rectangular
+@param kernel structuring element used for dilation; if element=Mat(), a 3 x 3 rectangular
 structuring element is used. Kernel can be created using #getStructuringElement
@param anchor position of the anchor within the element; default value (-1, -1) means that the
 anchor is at the element center.
@ -2809,7 +2813,7 @@ It makes possible to do a fast blurring or fast block correlation with a variabl
 example. In case of multi-channel images, sums for each channel are accumulated independently.

 As a practical example, the next figure shows the calculation of the integral of a straight
-rectangle Rect(3,3,3,2) and of a tilted rectangle Rect(5,1,2,3) . The selected pixels in the
+rectangle Rect(4,4,3,2) and of a tilted rectangle Rect(5,1,2,3) . The selected pixels in the
 original image are shown, as well as the relative pixels in the integral images sum and tilted .

 ![integral calculation example](pics/integral.png)
@ -3174,7 +3178,14 @@ CV_EXPORTS void calcHist( const Mat* images, int nimages,
                          const int* histSize, const float** ranges,
                          bool uniform = true, bool accumulate = false );

-/** @overload */
+/** @overload
+
+this variant supports only uniform histograms.
+
+ranges argument is either empty vector or a flattened vector of histSize.size()*2 elements
+(histSize.size() element pairs). The first and second elements of each pair specify the lower and
+upper boundaries.
+*/
 CV_EXPORTS_W void calcHist( InputArrayOfArrays images,
                            const std::vector<int>& channels,
                            InputArray mask, OutputArray hist,
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@ -1058,7 +1058,7 @@ EllipseEx( Mat& img, Point2l center, Size2l axes,
 *                                Polygons filling                                        *
 \****************************************************************************************/

-static inline void ICV_HLINE_X(uchar* ptr, int xl, int xr, const uchar* color, int pix_size)
+static inline void ICV_HLINE_X(uchar* ptr, int64_t xl, int64_t xr, const uchar* color, int pix_size)
 {
    uchar* hline_min_ptr = (uchar*)(ptr) + (xl)*(pix_size);
    uchar* hline_end_ptr = (uchar*)(ptr) + (xr+1)*(pix_size);
@ -1083,7 +1083,7 @@ static inline void ICV_HLINE_X(uchar* ptr, int xl, int xr, const uchar* color, i
 }
 //end ICV_HLINE_X()

-static inline void ICV_HLINE(uchar* ptr, int xl, int xr, const void* color, int pix_size)
+static inline void ICV_HLINE(uchar* ptr, int64_t xl, int64_t xr, const void* color, int pix_size)
 {
  ICV_HLINE_X(ptr, xl, xr, reinterpret_cast<const uchar*>(color), pix_size);
 }
@ -1177,7 +1177,7 @@ FillConvexPoly( Mat& img, const Point2l* v, int npts, const void* color, int lin
    edge[0].x = edge[1].x = -XY_ONE;
    edge[0].dx = edge[1].dx = 0;

-    ptr += img.step*y;
+    ptr += (int64_t)img.step*y;

    do
    {
@ -1206,7 +1206,7 @@ FillConvexPoly( Mat& img, const Point2l* v, int npts, const void* color, int lin
                            }

                            edge[i].ye = ty;
-                            edge[i].dx = ((xe - xs)*2 + (ty - y)) / (2 * (ty - y));
+                            edge[i].dx = ((xe - xs)*2 + ((int64_t)ty - y)) / (2 * ((int64_t)ty - y));
                            edge[i].x = xs;
                            edge[i].idx = idx;
                            break;
@ -1480,7 +1480,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
    size_t step = img.step;
    int pix_size = (int)img.elemSize();
    uchar* ptr = img.ptr();
-    int err = 0, dx = radius, dy = 0, plus = 1, minus = (radius << 1) - 1;
+    int64_t err = 0, dx = radius, dy = 0, plus = 1, minus = (radius << 1) - 1;
    int inside = center.x >= radius && center.x < size.width - radius &&
        center.y >= radius && center.y < size.height - radius;

@ -1490,8 +1490,8 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
    while( dx >= dy )
    {
        int mask;
-        int y11 = center.y - dy, y12 = center.y + dy, y21 = center.y - dx, y22 = center.y + dx;
-        int x11 = center.x - dx, x12 = center.x + dx, x21 = center.x - dy, x22 = center.x + dy;
+        int64_t y11 = center.y - dy, y12 = center.y + dy, y21 = center.y - dx, y22 = center.y + dx;
+        int64_t x11 = center.x - dx, x12 = center.x + dx, x21 = center.x - dy, x22 = center.x + dy;

        if( inside )
        {
@ -1531,7 +1531,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
        {
            if( fill )
            {
-                x11 = std::max( x11, 0 );
+                x11 = std::max( x11, (int64_t)0 );
                x12 = MIN( x12, size.width - 1 );
            }

@ -1569,7 +1569,7 @@ Circle( Mat& img, Point center, int radius, const void* color, int fill )
            {
                if( fill )
                {
-                    x21 = std::max( x21, 0 );
+                    x21 = std::max( x21, (int64_t)0 );
                    x22 = MIN( x22, size.width - 1 );
                }

@ -1866,6 +1866,12 @@ void rectangle( InputOutputArray img, Rect rec,
 {
    CV_INSTRUMENT_REGION();

+    CV_Assert( 0 <= shift && shift <= XY_SHIFT );
+
+    // Crop the rectangle to right around the mat.
+    rec &= Rect(-(1 << shift), -(1 << shift), ((img.cols() + 2) << shift),
+                ((img.rows() + 2) << shift));
+
    if( !rec.empty() )
        rectangle( img, rec.tl(), rec.br() - Point(1<<shift,1<<shift),
                   color, thickness, lineType, shift );
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@ -963,7 +963,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int)

                if (dsize.width > ssize.width*2)
                {
-                    row[(_dst.cols-1) + x] = row[dx + cn];
+                    row[(_dst.cols-1) * cn + x] = row[dx + cn];
                }
            }

--- a/modules/imgproc/test/test_pyramid.cpp
+++ b/modules/imgproc/test/test_pyramid.cpp
@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+TEST(Imgproc_PyrUp, pyrUp_regression_22184)
+{
+    Mat src(100, 100, CV_16UC3, Scalar::all(255));
+    Mat dst(100 * 2 + 1, 100 * 2 + 1, CV_16UC3, Scalar::all(0));
+    pyrUp(src, dst, Size(dst.cols, dst.rows));
+    double min_val = 0;
+    minMaxLoc(dst, &min_val);
+    ASSERT_GT(cvRound(min_val), 0);
+}
+
+}}  // namespace
--- a/modules/js/src/core_bindings.cpp
+++ b/modules/js/src/core_bindings.cpp
@ -453,6 +453,7 @@ namespace binding_utils
 EMSCRIPTEN_BINDINGS(binding_utils)
 {
    register_vector<int>("IntVector");
+    register_vector<char>("CharVector");
    register_vector<float>("FloatVector");
    register_vector<double>("DoubleVector");
    register_vector<cv::Point>("PointVector");
--- a/modules/objdetect/test/test_precomp.hpp
+++ b/modules/objdetect/test/test_precomp.hpp
@ -7,4 +7,10 @@
 #include "opencv2/ts.hpp"
 #include "opencv2/objdetect.hpp"

+#if defined CV_CXX11
+  #include <random>
+#else
+  #include <cstdlib>
+#endif
+
 #endif
--- a/modules/objdetect/test/test_qrcode_encode.cpp
+++ b/modules/objdetect/test/test_qrcode_encode.cpp
@ -5,6 +5,16 @@
 #include "test_precomp.hpp"
 namespace opencv_test { namespace {

+#if !defined CV_CXX11
+// Wrapper for generating seeded random number via std::rand.
+template<unsigned Seed>
+class SeededRandFunctor {
+public:
+    SeededRandFunctor() { std::srand(Seed); }
+    int operator()(int i) { return std::rand() % (i + 1); }
+};
+#endif
+
 std::string encode_qrcode_images_name[] = {
        "version1_mode1.png", "version1_mode2.png", "version1_mode4.png",
        "version2_mode1.png", "version2_mode2.png", "version2_mode4.png",
@ -380,8 +390,15 @@ TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, DISABLED_regression)
        std::string symbol_set = config["symbols_set"];

        std::string input_info = symbol_set;
-        std::random_shuffle(input_info.begin(), input_info.end());
-
+#if defined CV_CXX11
+        // std::random_shuffle is deprecated since C++11 and removed in C++17.
+        // Use manually constructed RNG with a fixed seed and std::shuffle instead.
+        std::mt19937 rand_gen {1};
+        std::shuffle(input_info.begin(), input_info.end(), rand_gen);
+#else
+        SeededRandFunctor<1> rand_gen;
+        std::random_shuffle(input_info.begin(), input_info.end(), rand_gen);
+#endif
        for (int j = min_stuctures_num; j < max_stuctures_num; j++)
        {
            QRCodeEncoder::Params params;
--- a/modules/python/CMakeLists.txt
+++ b/modules/python/CMakeLists.txt
@ -35,7 +35,7 @@ add_subdirectory(python3)

 else()  # standalone build

-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 2.8.12.2)
 project(OpenCVPython CXX C)
 include("./standalone.cmake")

--- a/modules/python/package/cv2/init.py
+++ b/modules/python/package/cv2/init.py
@ -89,7 +89,7 @@ def bootstrap():
    BINARIES_PATHS = []

    g_vars = globals()
-    l_vars = locals()
+    l_vars = locals().copy()

    if sys.version_info[:2] < (3, 0):
        from . load_config_py2 import exec_file_wrapper
--- a/modules/videoio/src/ffmpeg_codecs.hpp
+++ b/modules/videoio/src/ffmpeg_codecs.hpp
@ -60,6 +60,7 @@ extern "C" {
 #include <errno.h>
 #endif

+#include <libavcodec/version.h>
 #include <libavformat/avformat.h>

 #ifdef __cplusplus
--- a/platforms/android/android.toolchain.cmake
+++ b/platforms/android/android.toolchain.cmake
@ -189,7 +189,7 @@
 #
 # ------------------------------------------------------------------------------

-cmake_minimum_required( VERSION 2.6.3 )
+cmake_minimum_required( VERSION 2.8.12.2 )

 if( DEFINED CMAKE_CROSSCOMPILING )
 # subsequent toolchain loading is not really needed
--- a/samples/cpp/imgcodecs_jpeg.cpp
+++ b/samples/cpp/imgcodecs_jpeg.cpp
@ -17,7 +17,7 @@ int main(int /*argc*/, const char** /* argv */ )
    {
        const Point center( img.rows / 2 , img.cols /2 );

-        for( int radius = 5; radius < img.rows ; radius += 3.5 )
+        for( int radius = 5; radius < img.rows ; radius += 3 )
        {
            cv::circle( img, center, radius, Scalar(255,0,255) );
        }
--- a/samples/hal/c_hal/CMakeLists.txt
+++ b/samples/hal/c_hal/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+cmake_minimum_required(VERSION 2.8.12.2 FATAL_ERROR)

 set(PROJECT_NAME "c_hal")
 set(HAL_LIB_NAME "c_hal")
--- a/samples/hal/slow_hal/CMakeLists.txt
+++ b/samples/hal/slow_hal/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+cmake_minimum_required(VERSION 2.8.12.2 FATAL_ERROR)

 set(PROJECT_NAME "slow_hal")
 set(HAL_LIB_NAME "slow_hal")
--- a/samples/openvx/CMakeLists.txt
+++ b/samples/openvx/CMakeLists.txt
@ -1,6 +1,6 @@
 ocv_install_example_src(cpp *.cpp *.hpp CMakeLists.txt)

-cmake_minimum_required(VERSION 2.8.9)
+cmake_minimum_required(VERSION 2.8.12.2)

 set(OPENCV_OPENVX_SAMPLE_REQUIRED_DEPS
  opencv_core