Merge branch 4.x

pull/24254/head
Alexander Smorkalov 1 year ago
commit fdab565711
  1. 3rdparty/libtengine/tengine.cmake (80)
  2. 3rdparty/readme.txt (4)
  3. CMakeLists.txt (14)
  4. apps/visualisation/opencv_visualisation.cpp (1)
  5. cmake/OpenCVDetectPython.cmake (2)
  6. cmake/OpenCVFindTengine.cmake (78)
  7. cmake/mirrors/custom.cmake (7)
  8. cmake/mirrors/gitcode.cmake (5)
  9. doc/tutorials/introduction/config_reference/config_reference.markdown (12)
  10. modules/3d/src/ap3p.cpp (72)
  11. modules/3d/src/usac/essential_solver.cpp (3)
  12. modules/3d/test/test_affine2d_estimator.cpp (4)
  13. modules/3d/test/test_affine3d_estimator.cpp (4)
  14. modules/3d/test/test_affine_partial2d_estimator.cpp (4)
  15. modules/3d/test/test_solvepnp_ransac.cpp (62)
  16. modules/3d/test/test_translation3d_estimator.cpp (4)
  17. modules/core/CMakeLists.txt (20)
  18. modules/core/include/opencv2/core.hpp (7)
  19. modules/core/include/opencv2/core/cuda.hpp (46)
  20. modules/core/include/opencv2/core/hal/intrin_neon.hpp (4)
  21. modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp (3)
  22. modules/core/include/opencv2/core/opencl/opencl_info.hpp (1)
  23. modules/core/include/opencv2/core/utils/filesystem.private.hpp (3)
  24. modules/core/perf/perf_arithm.cpp (27)
  25. modules/core/src/arithm.cpp (52)
  26. modules/core/src/arithm.simd.hpp (119)
  27. modules/core/src/check.cpp (2)
  28. modules/core/src/convert.hpp (116)
  29. modules/core/src/convert.simd.hpp (24)
  30. modules/core/src/convert_scale.simd.hpp (24)
  31. modules/core/src/copy.cpp (18)
  32. modules/core/src/count_non_zero.simd.hpp (72)
  33. modules/core/src/lapack.cpp (88)
  34. modules/core/src/mathfuncs.cpp (138)
  35. modules/core/src/matmul.simd.hpp (128)
  36. modules/core/src/matrix_transform.cpp (228)
  37. modules/core/src/mean.dispatch.cpp (4)
  38. modules/core/src/merge.dispatch.cpp (1)
  39. modules/core/src/merge.simd.hpp (20)
  40. modules/core/src/minmax.cpp (2)
  41. modules/core/src/norm.cpp (66)
  42. modules/core/src/persistence.cpp (33)
  43. modules/core/src/persistence_xml.cpp (6)
  44. modules/core/src/split.simd.hpp (20)
  45. modules/core/src/stat.simd.hpp (26)
  46. modules/core/src/sum.dispatch.cpp (2)
  47. modules/core/src/sum.simd.hpp (124)
  48. modules/core/src/utils/filesystem.cpp (6)
  49. modules/core/test/test_arithm.cpp (133)
  50. modules/core/test/test_countnonzero.cpp (2)
  51. modules/core/test/test_intrin_utils.hpp (8)
  52. modules/dnn/CMakeLists.txt (14)
  53. modules/dnn/include/opencv2/dnn/dnn.hpp (7)
  54. modules/dnn/misc/python/test/test_dnn.py (6)
  55. modules/dnn/perf/perf_caffe.cpp (4)
  56. modules/dnn/perf/perf_layer.cpp (1)
  57. modules/dnn/perf/perf_net.cpp (2)
  58. modules/dnn/src/dnn_params.cpp (4)
  59. modules/dnn/src/dnn_utils.cpp (65)
  60. modules/dnn/src/ie_ngraph.cpp (21)
  61. modules/dnn/src/ie_ngraph.hpp (6)
  62. modules/dnn/src/layers/batch_norm_layer.cpp (2)
  63. modules/dnn/src/layers/blank_layer.cpp (2)
  64. modules/dnn/src/layers/concat_layer.cpp (4)
  65. modules/dnn/src/layers/convolution_layer.cpp (97)
  66. modules/dnn/src/layers/cpu_kernels/convolution.hpp (2)
  67. modules/dnn/src/layers/crop_and_resize_layer.cpp (2)
  68. modules/dnn/src/layers/elementwise_layers.cpp (28)
  69. modules/dnn/src/layers/eltwise_layer.cpp (15)
  70. modules/dnn/src/layers/flatten_layer.cpp (2)
  71. modules/dnn/src/layers/fully_connected_layer.cpp (2)
  72. modules/dnn/src/layers/lrn_layer.cpp (2)
  73. modules/dnn/src/layers/max_unpooling_layer.cpp (4)
  74. modules/dnn/src/layers/mvn_layer.cpp (2)
  75. modules/dnn/src/layers/nary_eltwise_layers.cpp (6)
  76. modules/dnn/src/layers/normalize_bbox_layer.cpp (8)
  77. modules/dnn/src/layers/pooling_layer.cpp (9)
  78. modules/dnn/src/layers/proposal_layer.cpp (4)
  79. modules/dnn/src/layers/region_layer.cpp (4)
  80. modules/dnn/src/layers/resize_layer.cpp (2)
  81. modules/dnn/src/layers/scale_layer.cpp (18)
  82. modules/dnn/src/layers/slice_layer.cpp (2)
  83. modules/dnn/src/layers/softmax_layer.cpp (2)
  84. modules/dnn/src/net_impl_fuse.cpp (2)
  85. modules/dnn/src/net_openvino.cpp (65)
  86. modules/dnn/src/op_halide.cpp (1)
  87. modules/dnn/src/opencl/gemm_buffer.cl (38)
  88. modules/dnn/src/tengine4dnn/include/tengine_graph_convolution.hpp (53)
  89. modules/dnn/src/tengine4dnn/src/tengine_graph_convolution.cpp (370)
  90. modules/dnn/test/test_backends.cpp (4)
  91. modules/dnn/test/test_caffe_importer.cpp (8)
  92. modules/dnn/test/test_halide_layers.cpp (29)
  93. modules/dnn/test/test_int8_layers.cpp (6)
  94. modules/dnn/test/test_misc.cpp (22)
  95. modules/dnn/test/test_model.cpp (10)
  96. modules/dnn/test/test_tflite_importer.cpp (14)
  97. modules/features2d/3rdparty/mscr/chi_table.h (135)
  98. modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt (28)
  99. modules/features2d/CMakeLists.txt (2)
  100. modules/features2d/src/mser.cpp (118)

  Some files were not shown because too many files have changed in this diff.

@ -1,80 +0,0 @@
# COPYRIGHT
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# License); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2020, OPEN AI LAB
# Author: qtang@openailab.com or https://github.com/BUG1989
# qli@openailab.com
# sqfu@openailab.com
SET(TENGINE_COMMIT_VERSION "e89cf8870de2ff0a80cfe626c0b52b2a16fb302e")
SET(OCV_TENGINE_DIR "${OpenCV_BINARY_DIR}/3rdparty/libtengine")
SET(OCV_TENGINE_SOURCE_PATH "${OCV_TENGINE_DIR}/Tengine-${TENGINE_COMMIT_VERSION}")
IF(EXISTS "${OCV_TENGINE_SOURCE_PATH}")
MESSAGE(STATUS "Tengine is exist already at: ${OCV_TENGINE_SOURCE_PATH}")
SET(Tengine_FOUND ON)
SET(BUILD_TENGINE ON)
ELSE()
SET(OCV_TENGINE_FILENAME "${TENGINE_COMMIT_VERSION}.zip")#name
SET(OCV_TENGINE_URL "https://github.com/OAID/Tengine/archive/") #url
SET(tengine_md5sum 23f61ebb1dd419f1207d8876496289c5) #md5sum
ocv_download(FILENAME ${OCV_TENGINE_FILENAME}
HASH ${tengine_md5sum}
URL
"${OPENCV_TENGINE_URL}"
"$ENV{OPENCV_TENGINE_URL}"
"${OCV_TENGINE_URL}"
DESTINATION_DIR "${OCV_TENGINE_DIR}"
ID TENGINE
STATUS res
UNPACK RELATIVE_URL)
if (NOT res)
MESSAGE(STATUS "TENGINE DOWNLOAD FAILED. Turning Tengine_FOUND off.")
SET(Tengine_FOUND OFF)
else ()
MESSAGE(STATUS "TENGINE DOWNLOAD success . ")
SET(Tengine_FOUND ON)
SET(BUILD_TENGINE ON)
endif()
ENDIF()
if(BUILD_TENGINE)
SET(HAVE_TENGINE 1)
if(NOT ANDROID)
# linux system
if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
SET(TENGINE_TOOLCHAIN_FLAG "-march=armv7-a")
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) ## AARCH64
SET(TENGINE_TOOLCHAIN_FLAG "-march=armv8-a")
endif()
endif()
SET(BUILT_IN_OPENCV ON) ## set for tengine compile discern .
SET(Tengine_INCLUDE_DIR "${OCV_TENGINE_SOURCE_PATH}/include" CACHE INTERNAL "")
if(EXISTS "${OCV_TENGINE_SOURCE_PATH}/CMakeLists.txt")
add_subdirectory("${OCV_TENGINE_SOURCE_PATH}" "${OCV_TENGINE_DIR}/build")
else()
message(WARNING "TENGINE: Missing 'CMakeLists.txt' in source code package: ${OCV_TENGINE_SOURCE_PATH}")
endif()
SET(Tengine_LIB "tengine" CACHE INTERNAL "")
endif()

@ -39,7 +39,9 @@ libspng Portable Network Graphics library.
libtiff Tag Image File Format (TIFF) Software
Copyright (c) 1988-1997 Sam Leffler
Copyright (c) 1991-1997 Silicon Graphics, Inc.
See libtiff home page http://www.libtiff.org/
See libtiff home page #1 http://www.simplesystems.org/libtiff/
#2 https://libtiff.gitlab.io/libtiff/
#3 http://libtiff.maptools.org/
for details and links to the source code
WITH_TIFF CMake option must be ON to add libtiff & zlib support to imgcodecs.

@ -463,9 +463,6 @@ OCV_OPTION(WITH_ANDROID_MEDIANDK "Use Android Media NDK for Video I/O (Android)"
OCV_OPTION(WITH_ANDROID_NATIVE_CAMERA "Use Android NDK for Camera I/O (Android)" (ANDROID_NATIVE_API_LEVEL GREATER 23)
VISIBLE_IF ANDROID
VERIFY HAVE_ANDROID_NATIVE_CAMERA)
OCV_OPTION(WITH_TENGINE "Include Arm Inference Tengine support" OFF
VISIBLE_IF (ARM OR AARCH64) AND (UNIX OR ANDROID) AND NOT IOS
VERIFY HAVE_TENGINE)
OCV_OPTION(WITH_ONNX "Include Microsoft ONNX Runtime support" OFF
VISIBLE_IF TRUE
VERIFY HAVE_ONNX)
@ -768,9 +765,6 @@ if(WITH_LAPACK)
endif()
include(cmake/OpenCVFindProtobuf.cmake)
include(cmake/OpenCVDetectFlatbuffers.cmake)
if(WITH_TENGINE)
include(cmake/OpenCVFindTengine.cmake)
endif()
if(WITH_TIMVX)
include(cmake/OpenCVFindTIMVX.cmake)
endif()
@ -1623,10 +1617,6 @@ if(WITH_VA OR HAVE_VA)
status(" VA:" HAVE_VA THEN "YES" ELSE NO)
endif()
if(WITH_TENGINE OR HAVE_TENGINE)
status(" Tengine:" HAVE_TENGINE THEN "YES (${TENGINE_LIBRARIES})" ELSE NO)
endif()
if(WITH_LAPACK OR HAVE_LAPACK)
status(" Lapack:" HAVE_LAPACK THEN "YES (${LAPACK_LIBRARIES} ${LAPACK_VERSION})" ELSE NO)
endif()
@ -1693,6 +1683,10 @@ else()
endif()
endif()
if(BUILD_opencv_dnn AND OPENCV_DNN_BACKEND_DEFAULT)
status(" Default DNN backend:" ${OPENCV_DNN_BACKEND_DEFAULT})
endif()
if(WITH_EIGEN OR HAVE_EIGEN)
status(" Eigen:" HAVE_EIGEN THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
endif()

@ -60,6 +60,7 @@ Created by: Puttemans Steven - April 2016
#include <fstream>
#include <iostream>
#include <sstream>
using namespace std;
using namespace cv;

@ -251,7 +251,7 @@ if(NOT ${found})
set(${include_path} "${_include_path}" CACHE INTERNAL "")
set(${include_dir} "${_include_dir}" CACHE PATH "Python include dir")
set(${include_dir2} "${_include_dir2}" CACHE PATH "Python include dir 2")
set(${packages_path} "${_packages_path}" CACHE PATH "Where to install the python packages.")
set(${packages_path} "${_packages_path}" CACHE STRING "Where to install the python packages.")
set(${numpy_include_dirs} ${_numpy_include_dirs} CACHE PATH "Path to numpy headers")
set(${numpy_version} "${_numpy_version}" CACHE INTERNAL "")
endif()

@ -1,78 +0,0 @@
# COPYRIGHT
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# License); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2020, OPEN AI LAB
# Author: qtang@openailab.com or https://github.com/BUG1989
#
# ----------------------------------------------------------------------------
# Path for Tengine binaries
# ----------------------------------------------------------------------------
set(OPENCV_LIBTENGINE_ROOT_DIR "" CACHE PATH "Path to TENGINE binaries installation")
IF(OPENCV_LIBTENGINE_ROOT_DIR AND NOT BUILD_TENGINE)
MESSAGE(STATUS "TENGINE:-- Use binaries at ${OPENCV_LIBTENGINE_ROOT_DIR}")
SET(Tengine_FOUND ON)
set(BUILD_TENGINE OFF)
SET(Tengine_INCLUDE_DIR "${OPENCV_LIBTENGINE_ROOT_DIR}/include" CACHE PATH "TENGINE include dir")
SET(Tengine_LIB "${OPENCV_LIBTENGINE_ROOT_DIR}/lib/libtengine.a" CACHE PATH "TENGINE library dir")
ELSE()
IF(ANDROID)
IF(OPENCV_TENGINE_FORCE_ANDROID)
# nothing, use Android
ELSEIF(OPENCV_TENGINE_SKIP_ANDROID)
set(Tengine_FOUND OFF)
set(HAVE_TENGINE FALSE)
return()
ELSEIF(NOT DEFINED ANDROID_NDK_REVISION)
MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION is not defined")
set(Tengine_FOUND OFF)
set(HAVE_TENGINE FALSE)
return()
ELSEIF(ANDROID_NDK_REVISION VERSION_LESS 14)
MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION=${ANDROID_NDK_REVISION}")
set(Tengine_FOUND OFF)
set(HAVE_TENGINE FALSE)
return()
ENDIF()
ENDIF()
MESSAGE(STATUS "TENGINE:-- Build Tengine from source code. ")
include("${OpenCV_SOURCE_DIR}/3rdparty/libtengine/tengine.cmake")
ENDIF()
IF(NOT Tengine_LIB)
SET(Tengine_FOUND OFF)
MESSAGE(STATUS "#### Could not find Tengine lib. Turning Tengine_FOUND off")
ENDIF()
IF (Tengine_FOUND)
MESSAGE(STATUS "Found Tengine include: ${Tengine_INCLUDE_DIR}")
MESSAGE(STATUS "Found Tengine libraries: ${Tengine_LIB}")
set(HAVE_TENGINE 1)
set(TENGINE_LIBRARIES ${Tengine_LIB})
set(TENGINE_INCLUDE_DIRS ${Tengine_INCLUDE_DIR})
ENDIF (Tengine_FOUND)
MARK_AS_ADVANCED(
Tengine_INCLUDE_DIR
Tengine_LIB
)

@ -1,15 +1,12 @@
# Gitlab-style mirror
# CMake scripts look for opencv/opencv_3rdparty,
# OAID/Tengine, 01org/tbb(oneAPI/oneTBB), opencv/ade
# 01org/tbb(oneAPI/oneTBB), opencv/ade
# from OPENCV_DOWNLOAD_MIRROR
ocv_update(OPENCV_DOWNLOAD_MIRROR_URL "")
######
# Download via commit id
######
# Tengine
ocv_update(TENGINE_PKG_MD5_CUSTOM "")
ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
# NVIDIA_OPTICAL_FLOW
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE "")
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
@ -77,7 +74,7 @@ else()
ocv_download_url_custom_usercontent(opencv)
elseif(DL_ID STREQUAL "wechat_qrcode")
ocv_download_url_gitcode_usercontent(WeChatCV)
elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
ocv_download_url_custom_archive_commit_id()
elseif(DL_ID STREQUAL "TBB")
ocv_download_url_custom_archive_release()

@ -1,9 +1,6 @@
######
# Download via commit id
######
# Tengine
ocv_update(TENGINE_PKG_MD5_GITCODE 1b5908632b557275cd6e85b0c03f9690)
ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
# NVIDIA_OPTICAL_FLOW
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE 8d5b7eeb24d6ca9c6bcfdff4196d5b47)
ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
@ -74,7 +71,7 @@ if((DL_ID STREQUAL "FFMPEG") OR (DL_ID STREQUAL "IPPICV") OR (DL_ID STREQUAL "da
ocv_download_url_gitcode_usercontent(opencv)
elseif(DL_ID STREQUAL "wechat_qrcode")
ocv_download_url_gitcode_usercontent(mirrors/WeChatCV)
elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
ocv_download_url_gitcode_archive_commit_id()
elseif(DL_ID STREQUAL "TBB")
ocv_download_url_gitcode_archive_release(OPENCV_TBB_SUBDIR)

@ -224,6 +224,16 @@ Following options can be used to produce special builds with instrumentation or
@see [Link time optimization](https://gcc.gnu.org/wiki/LinkTimeOptimization)
@see [ThinLTO](https://clang.llvm.org/docs/ThinLTO.html)
## Enable IPP optimization
The following options can be used to enable IPP optimizations for individual functions; each one increases the size of the OpenCV library. All options are disabled by default.
| Option | Functions | + roughly size |
| -------| --------- | -------------- |
| `OPENCV_IPP_GAUSSIAN_BLUR` | GaussianBlur() | +8Mb |
| `OPENCV_IPP_MEAN` | mean() / meanStdDev() | +0.2Mb |
| `OPENCV_IPP_MINMAX` | minMaxLoc() / minMaxIdx() | +0.2Mb |
| `OPENCV_IPP_SUM` | sum() | +0.1Mb |
# Functional features and dependencies {#tutorial_config_reference_func}
@ -484,7 +494,6 @@ OpenCV have own DNN inference module which have own build-in engine, but can als
| `OPENCV_DNN_CUDA` | _OFF_ | Enable CUDA backend. [CUDA](https://en.wikipedia.org/wiki/CUDA), CUBLAS and [CUDNN](https://developer.nvidia.com/cudnn) must be installed. |
| `WITH_HALIDE` | _OFF_ | Use experimental [Halide](https://en.wikipedia.org/wiki/Halide_(programming_language)) backend which can generate optimized code for dnn-layers at runtime. Halide must be installed. |
| `WITH_VULKAN` | _OFF_ | Enable experimental [Vulkan](https://en.wikipedia.org/wiki/Vulkan_(API)) backend. Does not require additional dependencies, but can use external Vulkan headers (`VULKAN_INCLUDE_DIRS`). |
| `WITH_TENGINE` | _OFF_ | Enable experimental [Tengine](https://github.com/OAID/Tengine) backend for ARM CPUs. Tengine library must be installed. |
# Installation layout {#tutorial_config_reference_install}
@ -566,6 +575,7 @@ Following options can be used to change installation layout for common scenarios
| ------ | ------- | ----------- |
| `OPENCV_ENABLE_NONFREE` | _OFF_ | Some algorithms included in the library are known to be protected by patents and are disabled by default. |
| `OPENCV_FORCE_3RDPARTY_BUILD`| _OFF_ | Enable all `BUILD_` options at once. |
| `OPENCV_IPP_ENABLE_ALL`| _OFF_ | Enable all `OPENCV_IPP_` options at once. |
| `ENABLE_CCACHE` | _ON_ (on Unix-like platforms) | Enable [ccache](https://en.wikipedia.org/wiki/Ccache) auto-detection. This tool wraps compiler calls and caches results, can significantly improve re-compilation time. |
| `ENABLE_PRECOMPILED_HEADERS` | _ON_ (for MSVC) | Enable precompiled headers support. Improves build time. |
| `BUILD_DOCS` | _OFF_ | Enable documentation build (_doxygen_, _doxygen_cpp_, _doxygen_python_, _doxygen_javadoc_ targets). [Doxygen](http://www.doxygen.org/index.html) must be installed for C++ documentation build. Python and [BeautifulSoup4](https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)) must be installed for Python documentation build. Javadoc and Ant must be installed for Java documentation build (part of Java SDK). |

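For readers trying the new `OPENCV_IPP_*` switches, a quick way to confirm what actually ended up in a given build is to query it at run time. This is only an illustrative sketch, not part of the patch; it relies on the existing `cv::getBuildInformation()` and `cv::ipp::useIPP()` calls.

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Dump the configuration summary; the "IPP" rows reflect the CMake options above.
    std::cout << cv::getBuildInformation() << std::endl;
    // Check whether IPP dispatching is currently enabled at run time.
    std::cout << "IPP in use: " << std::boolalpha << cv::ipp::useIPP() << std::endl;
    return 0;
}
```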
@ -1,5 +1,6 @@
#include "precomp.hpp"
#include "ap3p.h"
#include "polynom_solver.h"
#include <cmath>
#include <complex>
@ -7,67 +8,11 @@
static inline double cbrt(double x) { return (double)cv::cubeRoot((float)x); };
#endif
namespace cv {
static
void solveQuartic(const double *factors, double *realRoots)
{
const double &a4 = factors[0];
const double &a3 = factors[1];
const double &a2 = factors[2];
const double &a1 = factors[3];
const double &a0 = factors[4];
double a4_2 = a4 * a4;
double a3_2 = a3 * a3;
double a4_3 = a4_2 * a4;
double a2a4 = a2 * a4;
double p4 = (8 * a2a4 - 3 * a3_2) / (8 * a4_2);
double q4 = (a3_2 * a3 - 4 * a2a4 * a3 + 8 * a1 * a4_2) / (8 * a4_3);
double r4 = (256 * a0 * a4_3 - 3 * (a3_2 * a3_2) - 64 * a1 * a3 * a4_2 + 16 * a2a4 * a3_2) / (256 * (a4_3 * a4));
double p3 = ((p4 * p4) / 12 + r4) / 3; // /=-3
double q3 = (72 * r4 * p4 - 2 * p4 * p4 * p4 - 27 * q4 * q4) / 432; // /=2
double t; // *=2
std::complex<double> w;
if (q3 >= 0)
w = -std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
else
w = std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
if (w.imag() == 0.0) {
w.real(std::cbrt(w.real()));
t = 2.0 * (w.real() + p3 / w.real());
} else {
w = pow(w, 1.0 / 3);
t = 4.0 * w.real();
}
std::complex<double> sqrt_2m = sqrt(static_cast<std::complex<double> >(-2 * p4 / 3 + t));
double B_4A = -a3 / (4 * a4);
double complex1 = 4 * p4 / 3 + t;
#if defined(__clang__) && defined(__arm__) && (__clang_major__ == 3 || __clang_major__ == 4) && !defined(__ANDROID__)
// details: https://github.com/opencv/opencv/issues/11135
// details: https://github.com/opencv/opencv/issues/11056
std::complex<double> complex2 = 2 * q4;
complex2 = std::complex<double>(complex2.real() / sqrt_2m.real(), 0);
#else
std::complex<double> complex2 = 2 * q4 / sqrt_2m;
#endif
double sqrt_2m_rh = sqrt_2m.real() / 2;
double sqrt1 = sqrt(-(complex1 + complex2)).real() / 2;
realRoots[0] = B_4A + sqrt_2m_rh + sqrt1;
realRoots[1] = B_4A + sqrt_2m_rh - sqrt1;
double sqrt2 = sqrt(-(complex1 - complex2)).real() / 2;
realRoots[2] = B_4A - sqrt_2m_rh + sqrt2;
realRoots[3] = B_4A - sqrt_2m_rh - sqrt2;
}
static void polishQuarticRoots(const double *coeffs, double *roots) {
namespace {
void polishQuarticRoots(const double *coeffs, double *roots, int nb_roots) {
const int iterations = 2;
for (int i = 0; i < iterations; ++i) {
for (int j = 0; j < 4; ++j) {
for (int j = 0; j < nb_roots; ++j) {
double error =
(((coeffs[0] * roots[j] + coeffs[1]) * roots[j] + coeffs[2]) * roots[j] + coeffs[3]) * roots[j] +
coeffs[4];
@ -124,7 +69,9 @@ inline void mat_mult(const double a[3][3], const double b[3][3], double result[3
result[2][1] = a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1];
result[2][2] = a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2];
}
}
namespace cv {
void ap3p::init_inverse_parameters() {
inv_fx = 1. / fx;
inv_fy = 1. / fy;
@ -228,8 +175,9 @@ int ap3p::computePoses(const double featureVectors[3][4],
2 * (g6 * g7 - g1 * g2 - g3 * g4),
g7 * g7 - g2 * g2 - g4 * g4};
double s[4];
solveQuartic(coeffs, s);
polishQuarticRoots(coeffs, s);
int nb_roots = solve_deg4(coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4],
s[0], s[1], s[2], s[3]);
polishQuarticRoots(coeffs, s, nb_roots);
double temp[3];
vect_cross(k1, nl, temp);
@ -255,7 +203,7 @@ int ap3p::computePoses(const double featureVectors[3][4],
double reproj_errors[4];
int nb_solutions = 0;
for (int i = 0; i < 4; ++i) {
for (int i = 0; i < nb_roots; ++i) {
double ctheta1p = s[i];
if (abs(ctheta1p) > 1)
continue;

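The rewritten `polishQuarticRoots` above only touches the `nb_roots` real roots reported by `solve_deg4`, refining each one with a couple of Newton iterations on the quartic. The snippet below is an illustrative sketch of that refinement step (the helper name and the derivative tolerance are mine, not part of the patch):

```cpp
#include <cmath>

// One Newton-style polishing pass: evaluate the quartic and its derivative
// with Horner's rule and step the root toward the nearest zero.
static void polishRootNewton(const double c[5], double& x, int iters = 2)
{
    for (int it = 0; it < iters; ++it)
    {
        // p(x)  = c0*x^4 + c1*x^3 + c2*x^2 + c3*x + c4
        double p  = (((c[0]*x + c[1])*x + c[2])*x + c[3])*x + c[4];
        // p'(x) = 4*c0*x^3 + 3*c1*x^2 + 2*c2*x + c3
        double dp = ((4*c[0]*x + 3*c[1])*x + 2*c[2])*x + c[3];
        if (std::fabs(dp) > 1e-12)
            x -= p / dp;
    }
}
```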
@ -239,7 +239,8 @@ public:
// (5) Compute the left eigenvectors of the action matrix
Eigen::EigenSolver<Eigen::Matrix<double, 10, 10>> eigensolver(action_mat_eig);
const Eigen::VectorXcd &eigenvalues = eigensolver.eigenvalues();
const auto * const eig_vecs_ = (double *) eigensolver.eigenvectors().real().data();
const Eigen::MatrixXcd eigenvectors = eigensolver.eigenvectors();
const auto * const eig_vecs_ = (double *) eigenvectors.data();
#else
Matx<double, 10, 10> A = constraint_mat.colRange(0, 10),
B = constraint_mat.colRange(10, 20), eliminated_mat;

@ -115,8 +115,8 @@ TEST_P(EstimateAffine2D, testNPoints)
EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
m == accumulate(inliers.begin(), inliers.begin() + m, 0);
bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
EXPECT_TRUE(inliers_good);
}

@ -161,8 +161,8 @@ bool CV_Affine3D_EstTest::testNPoints()
return false;
}
bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
m == accumulate(outl.begin(), outl.begin() + m, 0);
bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
m == std::accumulate(outl.begin(), outl.begin() + m, 0);
if (!outl_good)
{

@ -125,8 +125,8 @@ TEST_P(EstimateAffinePartial2D, testNPoints)
EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
m == accumulate(inliers.begin(), inliers.begin() + m, 0);
bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
EXPECT_TRUE(inliers_good);
}

@ -41,6 +41,7 @@
//M*/
#include "test_precomp.hpp"
#include "opencv2/core/utils/logger.hpp"
namespace opencv_test { namespace {
@ -2259,4 +2260,65 @@ TEST(Calib3d_SolvePnP, inputShape)
}
}
bool hasNan(const cv::Mat& mat)
{
bool has = false;
if (mat.type() == CV_32F)
{
for(int i = 0; i < static_cast<int>(mat.total()); i++)
has |= cvIsNaN(mat.at<float>(i)) != 0;
}
else if (mat.type() == CV_64F)
{
for(int i = 0; i < static_cast<int>(mat.total()); i++)
has |= cvIsNaN(mat.at<double>(i)) != 0;
}
else
{
has = true;
CV_LOG_ERROR(NULL, "check hasNan called with unsupported type!");
}
return has;
}
TEST(AP3P, ctheta1p_nan_23607)
{
// the task is not well defined and may not converge (empty R, t) or should
// converge to some non-NaN solution
const std::array<cv::Point2d, 3> cameraPts = {
cv::Point2d{0.042784865945577621, 0.59844839572906494},
cv::Point2d{-0.028428621590137482, 0.60354739427566528},
cv::Point2d{0.0046037044376134872, 0.70674681663513184}
};
const std::array<cv::Point3d, 3> modelPts = {
cv::Point3d{-0.043258000165224075, 0.020459245890378952, -0.0069921980611979961},
cv::Point3d{-0.045648999512195587, 0.0029820732306689024, 0.0079000638797879219},
cv::Point3d{-0.043276999145746231, -0.013622495345771313, 0.0080113131552934647}
};
std::vector<Mat> R, t;
solveP3P(modelPts, cameraPts, Mat::eye(3, 3, CV_64F), Mat(), R, t, SOLVEPNP_AP3P);
EXPECT_EQ(R.size(), 2ul);
EXPECT_EQ(t.size(), 2ul);
// Try apply rvec and tvec to get model points from camera points.
Mat pts = Mat(modelPts).reshape(1, 3);
Mat expected = Mat(cameraPts).reshape(1, 3);
for (size_t i = 0; i < R.size(); ++i) {
EXPECT_TRUE(!hasNan(R[i]));
EXPECT_TRUE(!hasNan(t[i]));
Mat transform;
cv::Rodrigues(R[i], transform);
Mat res = pts * transform.t();
for (int j = 0; j < 3; ++j) {
res.row(j) += t[i].reshape(1, 1);
res.row(j) /= res.row(j).at<double>(2);
}
EXPECT_LE(cvtest::norm(res.colRange(0, 2), expected, NORM_INF), 3e-16);
}
}
}} // namespace

@ -91,8 +91,8 @@ TEST(Calib3d_EstimateTranslation3D, testNPoints)
<< "aff est: " << trans_est << endl
<< "aff ref: " << trans;
bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
m == accumulate(outl.begin(), outl.begin() + m, 0);
bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
m == std::accumulate(outl.begin(), outl.begin() + m, 0);
EXPECT_TRUE(outl_good);
}

@ -60,6 +60,26 @@ if(CV_TRACE AND HAVE_ITT)
add_definitions(-DOPENCV_WITH_ITT=1)
endif()
# https://github.com/opencv/opencv/issues/24145
if(HAVE_IPP)
OCV_OPTION(OPENCV_IPP_ENABLE_ALL "Enable all OPENCV_IPP_ options at once" OFF)
OCV_OPTION(OPENCV_IPP_MEAN "Enable IPP optimizations for mean (+200Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
OCV_OPTION(OPENCV_IPP_MINMAX "Enable IPP optimizations for minMaxLoc/minMaxIdx (+200Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
OCV_OPTION(OPENCV_IPP_SUM "Enable IPP optimizations for sum (+100Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
if(OPENCV_IPP_MEAN)
ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/mean.dispatch.cpp "OPENCV_IPP_MEAN=1")
endif()
if(OPENCV_IPP_MINMAX)
ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/minmax.cpp "OPENCV_IPP_MINMAX=1")
endif()
if(OPENCV_IPP_SUM)
ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/sum.dispatch.cpp "OPENCV_IPP_SUM=1")
endif()
endif()
file(GLOB lib_cuda_hdrs
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.hpp"
"${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.h")

@ -1118,6 +1118,13 @@ CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
*/
CV_EXPORTS_W void flipND(InputArray src, OutputArray dst, int axis);
/** @brief Broadcast the given Mat to the given shape.
* @param src input array
* @param shape target shape. Should be a list of CV_32S numbers. Note that negative values are not supported.
* @param dst output array that has the given shape
*/
CV_EXPORTS_W void broadcast(InputArray src, InputArray shape, OutputArray dst);
enum RotateFlags {
ROTATE_90_CLOCKWISE = 0, //!<Rotate 90 degrees clockwise
ROTATE_180 = 1, //!<Rotate 180 degrees clockwise

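A minimal usage sketch for the newly documented `cv::broadcast()`. The expected output assumes numpy-style broadcasting of a 1x3 row across four rows; the exact semantics are defined by the implementation, not by this example.

```cpp
#include <opencv2/core.hpp>
#include <iostream>
#include <vector>

int main()
{
    cv::Mat src = (cv::Mat_<float>(1, 3) << 1.f, 2.f, 3.f);
    std::vector<int> shape = {4, 3};   // target shape, CV_32S as required
    cv::Mat dst;
    cv::broadcast(src, shape, dst);    // src is repeated along the first axis
    std::cout << dst << std::endl;     // expected: four identical rows [1, 2, 3]
    return 0;
}
```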
@ -198,16 +198,32 @@ public:
CV_WRAP GpuMat clone() const;
//! copies the GpuMat content to device memory (Blocking call)
CV_WRAP void copyTo(OutputArray dst) const;
void copyTo(OutputArray dst) const;
//! bindings overload which copies the GpuMat content to device memory (Blocking call)
CV_WRAP void copyTo(CV_OUT GpuMat& dst) const {
copyTo(static_cast<OutputArray>(dst));
}
//! copies the GpuMat content to device memory (Non-Blocking call)
CV_WRAP void copyTo(OutputArray dst, Stream& stream) const;
void copyTo(OutputArray dst, Stream& stream) const;
//! bindings overload which copies the GpuMat content to device memory (Non-Blocking call)
CV_WRAP void copyTo(CV_OUT GpuMat& dst, Stream& stream) const {
copyTo(static_cast<OutputArray>(dst), stream);
}
//! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
CV_WRAP void copyTo(OutputArray dst, InputArray mask) const;
void copyTo(OutputArray dst, InputArray mask) const;
//! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask) const {
copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask));
}
//! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
//! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask, Stream& stream) const {
copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask), stream);
}
//! sets some of the GpuMat elements to s (Blocking call)
CV_WRAP GpuMat& setTo(Scalar s);
@ -222,19 +238,31 @@ public:
CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
//! converts GpuMat to another datatype (Blocking call)
CV_WRAP void convertTo(OutputArray dst, int rtype) const;
void convertTo(OutputArray dst, int rtype) const;
//! converts GpuMat to another datatype (Non-Blocking call)
CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const;
void convertTo(OutputArray dst, int rtype, Stream& stream) const;
//! bindings overload which converts GpuMat to another datatype (Non-Blocking call)
CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, Stream& stream) const {
convertTo(static_cast<OutputArray>(dst), rtype, stream);
}
//! converts GpuMat to another datatype with scaling (Blocking call)
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
//! bindings overload which converts GpuMat to another datatype with scaling(Blocking call)
CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha = 1.0, double beta = 0.0) const {
convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta);
}
//! converts GpuMat to another datatype with scaling (Non-Blocking call)
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
//! converts GpuMat to another datatype with scaling (Non-Blocking call)
CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
//! bindings overload which converts GpuMat to another datatype with scaling (Non-Blocking call)
CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha, double beta, Stream& stream) const {
convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta, stream);
}
CV_WRAP void assignTo(GpuMat& m, int type = -1) const;

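For context, this is roughly what the blocking and stream-based variants that the `CV_WRAP` wrappers above forward to look like from C++. A sketch only; it assumes a CUDA-enabled build and uses `cv::cuda::GpuMat` / `cv::cuda::Stream` from this header.

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>

void convert_example(const cv::Mat& host8u)
{
    cv::cuda::GpuMat src, f32, back8u;
    src.upload(host8u);                               // blocking upload to the device
    src.convertTo(f32, CV_32F, 1.0 / 255.0);          // blocking convert with scaling

    cv::cuda::Stream stream;
    f32.convertTo(back8u, CV_8U, 255.0, 0.0, stream); // non-blocking, queued on the stream
    stream.waitForCompletion();                       // synchronize before reading back8u
}
```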
@ -2014,12 +2014,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a)
inline v_int32x4 v_round(const v_float64x2& a)
{
static const int32x2_t zero = vdup_n_s32(0);
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));
}
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val))));
}
inline v_int32x4 v_floor(const v_float64x2& a)

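The NEON change above swaps `vcvtaq_s64_f64` (round half away from zero) for `vcvtnq_s64_f64` (round half to even), so the vector path matches the scalar `cvRound()` convention. A tiny check of the tie cases; expected values assume the default round-to-nearest-even FP mode.

```cpp
#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    const double ties[] = { 0.5, 1.5, 2.5, -0.5, -1.5 };
    for (double v : ties)
        std::printf("cvRound(%4.1f) = %d\n", v, cvRound(v));
    // With round-half-to-even: 0, 2, 2, 0, -2 (ties-away would give 1, 2, 3, -1, -2).
    return 0;
}
```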
@ -924,6 +924,9 @@ inline scalartype v_reduce_sum(const _Tpvec& a) \
return (scalartype)v_get0(res); \
}
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
#if CV_SIMD_SCALABLE_64F
OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, float, f64, VTraits<v_float64>::vlanes())
#endif
#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \

@ -3,6 +3,7 @@
// of this distribution and at http://opencv.org/license.html.
#include <iostream>
#include <sstream>
#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>

@ -12,7 +12,8 @@
# elif defined WINRT || defined _WIN32_WCE
/* not supported */
# elif defined __ANDROID__ || defined __linux__ || defined _WIN32 || \
defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__
defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__ || \
defined __GNU__
# define OPENCV_HAVE_FILESYSTEM_SUPPORT 1
# elif defined(__APPLE__)
# include <TargetConditionals.h>

@ -5,8 +5,35 @@ namespace opencv_test
{
using namespace perf;
using BroadcastTest = perf::TestBaseWithParam<std::tuple<std::vector<int>, perf::MatType, std::vector<int>>>;
typedef Size_MatType BinaryOpTest;
PERF_TEST_P_(BroadcastTest, basic)
{
std::vector<int> shape_src = get<0>(GetParam());
int dt_type = get<1>(GetParam());
std::vector<int> shape_dst = get<2>(GetParam());
cv::Mat src(static_cast<int>(shape_src.size()), shape_src.data(), dt_type);
cv::Mat dst(static_cast<int>(shape_dst.size()), shape_dst.data(), dt_type);
cv::randu(src, -1.f, 1.f);
TEST_CYCLE() cv::broadcast(src, shape_dst, dst);
SANITY_CHECK_NOTHING();
}
INSTANTIATE_TEST_CASE_P(/*nothing*/ , BroadcastTest,
testing::Combine(
testing::Values(std::vector<int>{1, 100, 800},
std::vector<int>{10, 1, 800},
std::vector<int>{10, 100, 1}),
testing::Values(CV_32FC1),
testing::Values(std::vector<int>{10, 100, 800})
)
);
PERF_TEST_P_(BinaryOpTest, min)
{
Size sz = get<0>(GetParam());

@ -1335,7 +1335,7 @@ struct InRange_SIMD
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct InRange_SIMD<uchar>
@ -1344,7 +1344,7 @@ struct InRange_SIMD<uchar>
uchar * dst, int len) const
{
int x = 0;
const int width = v_uint8::nlanes;
const int width = VTraits<v_uint8>::vlanes();
for (; x <= len - width; x += width)
{
@ -1352,7 +1352,7 @@ struct InRange_SIMD<uchar>
v_uint8 low = vx_load(src2 + x);
v_uint8 high = vx_load(src3 + x);
v_store(dst + x, (values >= low) & (high >= values));
v_store(dst + x, v_and(v_ge(values, low), v_ge(high, values)));
}
vx_cleanup();
return x;
@ -1366,7 +1366,7 @@ struct InRange_SIMD<schar>
uchar * dst, int len) const
{
int x = 0;
const int width = v_int8::nlanes;
const int width = VTraits<v_int8>::vlanes();
for (; x <= len - width; x += width)
{
@ -1374,7 +1374,7 @@ struct InRange_SIMD<schar>
v_int8 low = vx_load(src2 + x);
v_int8 high = vx_load(src3 + x);
v_store((schar*)(dst + x), (values >= low) & (high >= values));
v_store((schar*)(dst + x), v_and(v_ge(values, low), v_ge(high, values)));
}
vx_cleanup();
return x;
@ -1388,7 +1388,7 @@ struct InRange_SIMD<ushort>
uchar * dst, int len) const
{
int x = 0;
const int width = v_uint16::nlanes * 2;
const int width = VTraits<v_uint16>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1396,11 +1396,11 @@ struct InRange_SIMD<ushort>
v_uint16 low1 = vx_load(src2 + x);
v_uint16 high1 = vx_load(src3 + x);
v_uint16 values2 = vx_load(src1 + x + v_uint16::nlanes);
v_uint16 low2 = vx_load(src2 + x + v_uint16::nlanes);
v_uint16 high2 = vx_load(src3 + x + v_uint16::nlanes);
v_uint16 values2 = vx_load(src1 + x + VTraits<v_uint16>::vlanes());
v_uint16 low2 = vx_load(src2 + x + VTraits<v_uint16>::vlanes());
v_uint16 high2 = vx_load(src3 + x + VTraits<v_uint16>::vlanes());
v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
v_store(dst + x, v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
}
vx_cleanup();
return x;
@ -1414,7 +1414,7 @@ struct InRange_SIMD<short>
uchar * dst, int len) const
{
int x = 0;
const int width = (int)v_int16::nlanes * 2;
const int width = (int)VTraits<v_int16>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1422,11 +1422,11 @@ struct InRange_SIMD<short>
v_int16 low1 = vx_load(src2 + x);
v_int16 high1 = vx_load(src3 + x);
v_int16 values2 = vx_load(src1 + x + v_int16::nlanes);
v_int16 low2 = vx_load(src2 + x + v_int16::nlanes);
v_int16 high2 = vx_load(src3 + x + v_int16::nlanes);
v_int16 values2 = vx_load(src1 + x + VTraits<v_int16>::vlanes());
v_int16 low2 = vx_load(src2 + x + VTraits<v_int16>::vlanes());
v_int16 high2 = vx_load(src3 + x + VTraits<v_int16>::vlanes());
v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
v_store((schar*)(dst + x), v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
}
vx_cleanup();
return x;
@ -1440,7 +1440,7 @@ struct InRange_SIMD<int>
uchar * dst, int len) const
{
int x = 0;
const int width = (int)v_int32::nlanes * 2;
const int width = (int)VTraits<v_int32>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1448,11 +1448,11 @@ struct InRange_SIMD<int>
v_int32 low1 = vx_load(src2 + x);
v_int32 high1 = vx_load(src3 + x);
v_int32 values2 = vx_load(src1 + x + v_int32::nlanes);
v_int32 low2 = vx_load(src2 + x + v_int32::nlanes);
v_int32 high2 = vx_load(src3 + x + v_int32::nlanes);
v_int32 values2 = vx_load(src1 + x + VTraits<v_int32>::vlanes());
v_int32 low2 = vx_load(src2 + x + VTraits<v_int32>::vlanes());
v_int32 high2 = vx_load(src3 + x + VTraits<v_int32>::vlanes());
v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))));
v_pack_store(dst + x, v_reinterpret_as_u16(v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2)))));
}
vx_cleanup();
return x;
@ -1466,7 +1466,7 @@ struct InRange_SIMD<float>
uchar * dst, int len) const
{
int x = 0;
const int width = (int)v_float32::nlanes * 2;
const int width = (int)VTraits<v_float32>::vlanes() * 2;
for (; x <= len - width; x += width)
{
@ -1474,12 +1474,12 @@ struct InRange_SIMD<float>
v_float32 low1 = vx_load(src2 + x);
v_float32 high1 = vx_load(src3 + x);
v_float32 values2 = vx_load(src1 + x + v_float32::nlanes);
v_float32 low2 = vx_load(src2 + x + v_float32::nlanes);
v_float32 high2 = vx_load(src3 + x + v_float32::nlanes);
v_float32 values2 = vx_load(src1 + x + VTraits<v_float32>::vlanes());
v_float32 low2 = vx_load(src2 + x + VTraits<v_float32>::vlanes());
v_float32 high2 = vx_load(src3 + x + VTraits<v_float32>::vlanes());
v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1),
v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2)));
v_pack_store(dst + x, v_pack(v_and(v_reinterpret_as_u32(v_ge(values1, low1)), v_reinterpret_as_u32(v_ge(high1, values1))),
v_and(v_reinterpret_as_u32(v_ge(values2, low2)), v_reinterpret_as_u32(v_ge(high2, values2)))));
}
vx_cleanup();
return x;

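The pattern in these hunks repeats throughout the merge: compile-time `nlanes` becomes `VTraits<>::vlanes()` and operator expressions become named wrappers (`v_ge`, `v_and`, ...), so the same kernel also builds when `CV_SIMD_SCALABLE` (RVV) is enabled. Below is a stand-alone sketch of the resulting style, using only calls that appear in the diff; the helper function itself is illustrative.

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
// Writes 255 where lo <= src[i] <= hi, 0 otherwise (vector part only).
static int inRange_u8(const uchar* src, uchar lo, uchar hi, uchar* dst, int len)
{
    int x = 0;
    const int width = cv::VTraits<cv::v_uint8>::vlanes();   // run-time lane count
    const cv::v_uint8 vlo = cv::vx_setall_u8(lo);
    const cv::v_uint8 vhi = cv::vx_setall_u8(hi);
    for (; x <= len - width; x += width)
    {
        cv::v_uint8 v = cv::vx_load(src + x);
        cv::v_store(dst + x, cv::v_and(cv::v_ge(v, vlo), cv::v_ge(vhi, v)));
    }
    return x; // the caller finishes the scalar tail
}
#endif
```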
@ -215,7 +215,7 @@ template<typename T1, typename Tvec>
struct op_add
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a + b; }
{ return v_add(a, b); }
static inline T1 r(T1 a, T1 b)
{ return c_add(a, b); }
};
@ -225,7 +225,7 @@ template<typename T1, typename Tvec>
struct op_sub
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a - b; }
{ return v_sub(a, b); }
static inline T1 r(T1 a, T1 b)
{ return c_sub(a, b); }
};
@ -262,7 +262,7 @@ struct op_absdiff
template<>
struct op_absdiff<schar, v_int8>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_int8 r(const v_int8& a, const v_int8& b)
{ return v_absdiffs(a, b); }
#endif
@ -272,7 +272,7 @@ struct op_absdiff<schar, v_int8>
template<>
struct op_absdiff<short, v_int16>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_int16 r(const v_int16& a, const v_int16& b)
{ return v_absdiffs(a, b); }
#endif
@ -282,7 +282,7 @@ struct op_absdiff<short, v_int16>
template<>
struct op_absdiff<int, v_int32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_int32 r(const v_int32& a, const v_int32& b)
{ return v_reinterpret_as_s32(v_absdiff(a, b)); }
#endif
@ -295,7 +295,7 @@ template<typename T1, typename Tvec>
struct op_or
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a | b; }
{ return v_or(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a | b; }
};
@ -303,7 +303,7 @@ template<typename T1, typename Tvec>
struct op_xor
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a ^ b; }
{ return v_xor(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a ^ b; }
};
@ -311,7 +311,7 @@ template<typename T1, typename Tvec>
struct op_and
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a & b; }
{ return v_and(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a & b; }
};
@ -320,14 +320,14 @@ struct op_not
{
// ignored b from loader level
static inline Tvec r(const Tvec& a)
{ return ~a; }
{ return v_not(a); }
static inline T1 r(T1 a, T1)
{ return ~a; }
};
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct bin_loader
@ -392,13 +392,13 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef bin_loader<OP, T1, Tvec> ldr;
enum {wide_step = Tvec::nlanes};
const int wide_step = VTraits<Tvec>::vlanes();
#if !CV_NEON && CV_SIMD_WIDTH == 16
enum {wide_step_l = wide_step * 2};
const int wide_step_l = wide_step * 2;
#else
enum {wide_step_l = wide_step};
const int wide_step_l = wide_step;
#endif
#endif // CV_SIMD
@ -410,7 +410,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
#if !CV_NEON && !CV_MSA
if (is_aligned(src1, src2, dst))
{
@ -583,7 +583,7 @@ template<typename T1, typename Tvec>
struct op_cmplt
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a < b; }
{ return v_lt(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a < b); }
};
@ -592,7 +592,7 @@ template<typename T1, typename Tvec>
struct op_cmple
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a <= b; }
{ return v_le(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a <= b); }
};
@ -601,7 +601,7 @@ template<typename T1, typename Tvec>
struct op_cmpeq
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a == b; }
{ return v_eq(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a == b); }
};
@ -610,14 +610,14 @@ template<typename T1, typename Tvec>
struct op_cmpne
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a != b; }
{ return v_ne(a, b); }
static inline uchar r(T1 a, T1 b)
{ return (uchar)-(int)(a != b); }
};
//////////////////////////// Loaders /////////////////////////////////
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
struct cmp_loader_n
@ -642,10 +642,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
struct cmp_loader_n<sizeof(ushort), OP, T1, Tvec>
{
typedef OP<T1, Tvec> op;
enum {step = Tvec::nlanes};
static inline void l(const T1* src1, const T1* src2, uchar* dst)
{
const int step = VTraits<Tvec>::vlanes();
Tvec c0 = op::r(vx_load(src1), vx_load(src2));
Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step));
v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1)));
@ -656,10 +656,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
struct cmp_loader_n<sizeof(unsigned), OP, T1, Tvec>
{
typedef OP<T1, Tvec> op;
enum {step = Tvec::nlanes};
static inline void l(const T1* src1, const T1* src2, uchar* dst)
{
const int step = VTraits<Tvec>::vlanes();
v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2)));
v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step)));
v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@ -672,10 +672,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
struct cmp_loader_n<sizeof(double), OP, T1, Tvec>
{
typedef OP<T1, Tvec> op;
enum {step = Tvec::nlanes};
static inline void l(const T1* src1, const T1* src2, uchar* dst)
{
const int step = VTraits<Tvec>::vlanes();
v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2)));
v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step)));
v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@ -697,9 +697,9 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
typedef OP<T1, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
enum {wide_step = Tvec::nlanes * sizeof(T1)};
const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
#endif // CV_SIMD
step1 /= sizeof(T1);
@ -709,7 +709,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, dst + x);
@ -876,7 +876,7 @@ DEFINE_SIMD_ALL(cmp)
//////////////////////////// Loaders ///////////////////////////////
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
// todo: add support for RW alignment & stream
template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
struct scalar_loader_n
@ -1009,10 +1009,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
{
typedef OP<int, T2, v_int32> op;
enum {step = v_int32::nlanes};
static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src2 = vx_load(src2);
v_int32 v_src1s = vx_load(src1 + step);
@ -1039,6 +1039,7 @@ struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
static inline void l(const int* src1, const T2* scalar, int* dst)
{
const int step = VTraits<v_int32>::vlanes();
v_int32 v_src1 = vx_load(src1);
v_int32 v_src1s = vx_load(src1 + step);
@ -1064,10 +1065,9 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
{
typedef OP<float, T2, v_float32> op;
enum {step = v_float32::nlanes};
static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src2 = vx_load(src2);
v_float32 v_src1s = vx_load(src1 + step);
@ -1082,6 +1082,7 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
static inline void l(const float* src1, const T2* scalar, float* dst)
{
const int step = VTraits<v_float32>::vlanes();
v_float32 v_src1 = vx_load(src1);
v_float32 v_src1s = vx_load(src1 + step);
@ -1258,10 +1259,10 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
#endif // CV_SIMD
step1 /= sizeof(T1);
@ -1272,7 +1273,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, src2 + x, scalar, dst + x);
@ -1304,10 +1305,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
{
typedef OP<T1, T2, Tvec> op;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
#endif // CV_SIMD
step1 /= sizeof(T1);
@ -1317,7 +1318,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
{
int x = 0;
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
for (; x <= width - wide_step; x += wide_step)
{
ldr::l(src1 + x, scalar, dst + x);
@ -1424,7 +1425,7 @@ template<typename T1, typename Tvec>
struct op_mul
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a * b; }
{ return v_mul(a, b); }
static inline T1 r(T1 a, T1 b)
{ return saturate_cast<T1>(a * b); }
};
@ -1432,11 +1433,11 @@ struct op_mul
template<typename T1, typename T2, typename Tvec>
struct op_mul_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar * a * b;
return v_mul(v_scalar , a , b);
}
#endif
static inline T1 r(T1 a, T1 b, const T2* scalar)
@ -1452,7 +1453,7 @@ struct op_mul_scale<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return v_scalar * a * b;
return v_mul(v_mul(v_scalar, a), b);
}
#endif
static inline double r(double a, double b, const double* scalar)
@ -1565,7 +1566,7 @@ template<typename T1, typename Tvec>
struct op_div_f
{
static inline Tvec r(const Tvec& a, const Tvec& b)
{ return a / b; }
{ return v_div(a, b); }
static inline T1 r(T1 a, T1 b)
{ return a / b; }
};
@ -1573,16 +1574,16 @@ struct op_div_f
template<typename T1, typename T2, typename Tvec>
struct op_div_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
static inline Tvec pre(const Tvec& denom, const Tvec& res)
{
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
return v_select(v_eq(denom, v_zero), v_zero, res);
}
#endif
static inline T1 r(T1 a, T1 denom, const T2* scalar)
@ -1595,11 +1596,11 @@ struct op_div_scale
template<>
struct op_div_scale<float, float, v_float32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
#endif
static inline float r(float a, float denom, const float* scalar)
@ -1613,7 +1614,7 @@ struct op_div_scale<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return a * v_scalar / b;
return v_div(v_mul(a, v_scalar), b);
}
#endif
static inline double r(double a, double denom, const double* scalar)
@ -1681,7 +1682,7 @@ DEFINE_SIMD_ALL(div, div_loop)
template<typename T1, typename T2, typename Tvec>
struct op_add_scale
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
{
const v_float32 v_alpha = vx_setall_f32(*scalar);
@ -1714,7 +1715,7 @@ struct op_add_scale<double, double, v_float64>
template<typename T1, typename T2, typename Tvec>
struct op_add_weighted
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
{
const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@ -1831,16 +1832,16 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
template<typename T1, typename T2, typename Tvec>
struct op_recip
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const T2* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
static inline Tvec pre(const Tvec& denom, const Tvec& res)
{
const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
return v_select(denom == v_zero, v_zero, res);
const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
return v_select(v_eq(denom, v_zero), v_zero, res);
}
#endif
static inline T1 r(T1 denom, const T2* scalar)
@ -1853,11 +1854,11 @@ struct op_recip
template<>
struct op_recip<float, float, v_float32>
{
#if CV_SIMD
#if CV_SIMD || CV_SIMD_SCALABLE
static inline v_float32 r(const v_float32& a, const float* scalar)
{
const v_float32 v_scalar = vx_setall_f32(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
#endif
static inline float r(float denom, const float* scalar)
@ -1871,7 +1872,7 @@ struct op_recip<double, double, v_float64>
static inline v_float64 r(const v_float64& a, const double* scalar)
{
const v_float64 v_scalar = vx_setall_f64(*scalar);
return v_scalar / a;
return v_div(v_scalar, a);
}
#endif
static inline double r(double denom, const double* scalar)

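One more idiom worth calling out from `op_div_scale` / `op_recip` above: the quotient is computed unconditionally, and `v_select()` then zeroes the lanes whose denominator was zero, preserving OpenCV's "x / 0 == 0" convention. A hedged, stand-alone restatement:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static cv::v_float32 safe_div(const cv::v_float32& num, const cv::v_float32& den)
{
    const cv::v_float32 vzero = cv::vx_setzero_f32();
    cv::v_float32 q = cv::v_div(num, den);                // lanes with den == 0 hold inf/nan here
    return cv::v_select(cv::v_eq(den, vzero), vzero, q);  // ...and are forced back to 0
}
#endif
```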
@ -4,6 +4,8 @@
#include "precomp.hpp"
#include <sstream>
#include "opencv2/core/check.hpp"
namespace cv {

@ -11,7 +11,7 @@
namespace cv
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
static inline void vx_load_as(const uchar* ptr, v_float32& a)
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
@ -78,7 +78,7 @@ static inline void v_store_as(int64_t* ptr, const v_float32& a)
v_int64 ia_0, ia_1;
v_expand(ia, ia_0, ia_1);
v_store(ptr, ia_0);
v_store(ptr + v_int64::nlanes, ia_1);
v_store(ptr + VTraits<v_uint64>::vlanes(), ia_1);
}
static inline void v_store_as(uint64_t* ptr, const v_float32& a)
@ -88,7 +88,7 @@ static inline void v_store_as(uint64_t* ptr, const v_float32& a)
ia = v_max(ia, vx_setzero_s32());
v_expand(v_reinterpret_as_u32(ia), ia_0, ia_1);
v_store(ptr, ia_0);
v_store(ptr + v_int64::nlanes, ia_1);
v_store(ptr + VTraits<v_uint64>::vlanes(), ia_1);
}
static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b)
@ -104,7 +104,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b)
}
static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b)
{
@ -118,7 +118,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b)
{ v_expand(vx_load(ptr), a, b); }
static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b)
{
@ -147,7 +147,7 @@ static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b)
static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b)
{
a = vx_load(ptr);
b = vx_load(ptr + v_int32::nlanes);
b = vx_load(ptr + VTraits<v_int32>::vlanes());
}
static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b)
@ -184,14 +184,14 @@ static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b)
static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
{
v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes);
v_int32 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_int32>::vlanes());
a = v_cvt_f32(ia);
b = v_cvt_f32(ib);
}
static inline void vx_load_pair_as(const int64_t* ptr, v_int32& a, v_int32& b)
{
const int int64_nlanes = v_int64::nlanes;
const int int64_nlanes = VTraits<v_uint64>::vlanes();
a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
}
@ -199,7 +199,7 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_int32& a, v_int32& b)
static inline void vx_load_pair_as(const int64_t* ptr, v_uint64& a, v_uint64& b)
{
v_int64 z = vx_setzero_s64();
v_int64 ia = vx_load(ptr), ib = vx_load(ptr + v_int64::nlanes);
v_int64 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_uint64>::vlanes());
ia &= (ia > z);
ib &= (ib > z);
a = v_reinterpret_as_u64(ia);
@ -208,7 +208,7 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_uint64& a, v_uint64& b)
static inline void vx_load_pair_as(const int64_t* ptr, v_uint32& a, v_uint32& b)
{
const int nlanes = v_int64::nlanes;
const int nlanes = VTraits<v_uint64>::vlanes();
v_int64 z = vx_setzero_s64();
v_int64 ia0 = vx_load(ptr), ia1 = vx_load(ptr + nlanes);
v_int64 ib0 = vx_load(ptr + nlanes*2), ib1 = vx_load(ptr + nlanes*3);
@ -222,8 +222,8 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_uint32& a, v_uint32& b)
static inline void vx_load_pair_as(const uint64_t* ptr, v_float32& a, v_float32& b)
{
const int nlanes = v_uint64::nlanes;
float buf[v_uint64::nlanes*4];
const int nlanes = VTraits<v_uint64>::vlanes();
float buf[VTraits<v_uint64>::max_nlanes*4];
for (int i = 0; i < nlanes*4; i++) {
buf[i] = (float)ptr[i];
}
@ -233,8 +233,8 @@ static inline void vx_load_pair_as(const uint64_t* ptr, v_float32& a, v_float32&
static inline void vx_load_pair_as(const int64_t* ptr, v_float32& a, v_float32& b)
{
const int nlanes = v_int64::nlanes;
float buf[v_int64::nlanes*4];
const int nlanes = VTraits<v_uint64>::vlanes();
float buf[VTraits<v_uint64>::max_nlanes*4];
for (int i = 0; i < nlanes*4; i++) {
buf[i] = (float)ptr[i];
}
@ -277,21 +277,21 @@ static inline void vx_load_pair_as(const int* ptr, v_uint32& a, v_uint32& b)
{
v_int32 z = vx_setzero_s32();
v_int32 ia = v_max(vx_load(ptr), z);
v_int32 ib = v_max(vx_load(ptr + v_int32::nlanes), z);
v_int32 ib = v_max(vx_load(ptr + VTraits<v_int32>::vlanes()), z);
a = v_reinterpret_as_u32(ia);
b = v_reinterpret_as_u32(ib);
}
static inline void vx_load_pair_as(const uint64_t* ptr, v_uint32& a, v_uint32& b)
{
const int int64_nlanes = v_int64::nlanes;
const int int64_nlanes = VTraits<v_uint64>::vlanes();
a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
}
static inline void vx_load_pair_as(const uint64_t* ptr, v_int32& a, v_int32& b)
{
const int int64_nlanes = v_int64::nlanes;
const int int64_nlanes = VTraits<v_uint64>::vlanes();
v_uint32 ua = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
v_uint32 ub = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
a = v_reinterpret_as_s32(ua);
@ -299,37 +299,37 @@ static inline void vx_load_pair_as(const uint64_t* ptr, v_int32& a, v_int32& b)
}
static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_float32>::vlanes()); }
static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
{
a = vx_load_expand(ptr);
b = vx_load_expand(ptr + v_float32::nlanes);
b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
}
static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float32& a, v_float32& b)
{
a = vx_load_expand(ptr);
b = vx_load_expand(ptr + v_float32::nlanes);
b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
}
static inline void vx_load_pair_as(const unsigned* ptr, v_uint32& a, v_uint32& b)
{
a = vx_load(ptr);
b = vx_load(ptr + v_uint32::nlanes);
b = vx_load(ptr + VTraits<v_uint32>::vlanes());
}
static inline void vx_load_pair_as(const unsigned* ptr, v_int32& a, v_int32& b)
{
a = v_reinterpret_as_s32(vx_load(ptr));
b = v_reinterpret_as_s32(vx_load(ptr + v_uint32::nlanes));
b = v_reinterpret_as_s32(vx_load(ptr + VTraits<v_uint32>::vlanes()));
}
static inline void vx_load_pair_as(const unsigned* ptr, v_float32& a, v_float32& b)
{
v_uint32 delta = vx_setall_u32(0x80000000U);
v_uint32 ua = vx_load(ptr);
v_uint32 ub = vx_load(ptr + v_uint32::nlanes);
v_uint32 ub = vx_load(ptr + VTraits<v_uint32>::vlanes());
v_uint32 mask_a = (ua >= delta) & delta, mask_b = (ub >= delta) & delta;
v_float32 fmask_a = v_cvt_f32(v_reinterpret_as_s32(mask_a)); // 0.f or (float)(-(1 << 31))
v_float32 fmask_b = v_cvt_f32(v_reinterpret_as_s32(mask_b)); // 0.f or (float)(-(1 << 31))
@ -353,7 +353,7 @@ static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16
}
static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b)
{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); }
{ v_store(ptr, a); v_store(ptr + VTraits<v_uint16>::vlanes(), b); }
static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b)
{ v_store(ptr, v_pack_u(a, b)); }
@ -362,7 +362,7 @@ static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16&
{ v_store(ptr, v_pack(a, b)); }
static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b)
{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); }
{ v_store(ptr, a); v_store(ptr + VTraits<v_int16>::vlanes(), b); }
static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b)
{ v_pack_u_store(ptr, v_pack(a, b)); }
@ -379,7 +379,7 @@ static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32&
static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b)
{
v_store(ptr, a);
v_store(ptr + v_int32::nlanes, b);
v_store(ptr + VTraits<v_int32>::vlanes(), b);
}
static inline void v_store_pair_as(int64_t* ptr, const v_int32& a, const v_int32& b)
@ -387,7 +387,7 @@ static inline void v_store_pair_as(int64_t* ptr, const v_int32& a, const v_int32
v_int64 q0, q1, q2, q3;
v_expand(a, q0, q1);
v_expand(b, q2, q3);
const int nlanes = v_int64::nlanes;
const int nlanes = VTraits<v_uint64>::vlanes();
v_store(ptr, q0);
v_store(ptr + nlanes, q1);
v_store(ptr + nlanes*2, q2);
@ -419,11 +419,11 @@ static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32
{
v_int32 ia = v_round(a), ib = v_round(b);
v_store(ptr, ia);
v_store(ptr + v_int32::nlanes, ib);
v_store(ptr + VTraits<v_int32>::vlanes(), ib);
}
static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b)
{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); }
{ v_store(ptr, a); v_store(ptr + VTraits<v_float32>::vlanes(), b); }
static inline void v_store_pair_as(unsigned* ptr, const v_float32& a, const v_float32& b)
{
@ -431,7 +431,7 @@ static inline void v_store_pair_as(unsigned* ptr, const v_float32& a, const v_fl
v_int32 ia = v_max(v_round(a), z);
v_int32 ib = v_max(v_round(b), z);
v_store(ptr, v_reinterpret_as_u32(ia));
v_store(ptr + v_int32::nlanes, v_reinterpret_as_u32(ib));
v_store(ptr + VTraits<v_int32>::vlanes(), v_reinterpret_as_u32(ib));
}
static inline void v_store_pair_as(uchar* ptr, const v_uint32& a, const v_uint32& b)
@ -447,7 +447,7 @@ static inline void v_store_pair_as(ushort* ptr, const v_uint32& a, const v_uint3
static inline void v_store_pair_as(unsigned* ptr, const v_uint32& a, const v_uint32& b)
{
v_store(ptr, a);
v_store(ptr + v_uint32::nlanes, b);
v_store(ptr + VTraits<v_uint32>::vlanes(), b);
}
static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uint32& b)
@ -455,7 +455,7 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uin
v_uint64 q0, q1, q2, q3;
v_expand(a, q0, q1);
v_expand(b, q2, q3);
const int nlanes = v_uint64::nlanes;
const int nlanes = VTraits<v_uint64>::vlanes();
v_store(ptr, q0);
v_store(ptr + nlanes, q1);
v_store(ptr + nlanes*2, q2);
@ -465,28 +465,28 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uin
static inline void v_store_pair_as(uint64_t* ptr, const v_uint64& a, const v_uint64& b)
{
v_store(ptr, a);
v_store(ptr + v_uint64::nlanes, b);
v_store(ptr + VTraits<v_uint64>::vlanes(), b);
}
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
static inline void vx_load_as(const uint64_t* ptr, v_float32& a)
{
v_float64 a_0 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
v_float64 a_1 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + v_uint64::nlanes)));
v_float64 a_1 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + VTraits<v_uint64>::vlanes())));
a = v_cvt_f32(a_0, a_1);
}
static inline void vx_load_as(const int64_t* ptr, v_float32& a)
{
v_float64 a_0 = v_cvt_f64(vx_load(ptr));
v_float64 a_1 = v_cvt_f64(vx_load(ptr + v_uint64::nlanes));
v_float64 a_1 = v_cvt_f64(vx_load(ptr + VTraits<v_uint64>::vlanes()));
a = v_cvt_f32(a_0, a_1);
}
static inline void vx_load_as(const double* ptr, v_float32& a)
{
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
a = v_cvt_f32(v0, v1);
}
@ -516,8 +516,8 @@ static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float64& a, v_float6
static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
{
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
v_int32 iv0 = v_round(v0), iv1 = v_round(v1);
v_int32 iv2 = v_round(v2), iv3 = v_round(v3);
a = v_combine_low(iv0, iv1);
@ -526,15 +526,15 @@ static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
static inline void vx_load_pair_as(const uint64_t* ptr, v_float64& a, v_float64& b)
{
const int int64_nlanes = v_int64::nlanes;
const int int64_nlanes = VTraits<v_uint64>::vlanes();
a = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
b = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + int64_nlanes)));
}
static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b)
{
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
a = v_cvt_f32(v0, v1);
b = v_cvt_f32(v2, v3);
}
@ -584,19 +584,19 @@ static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b)
static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b)
{
a = vx_load(ptr);
b = vx_load(ptr + v_float64::nlanes);
b = vx_load(ptr + VTraits<v_float64>::vlanes());
}
static inline void vx_load_pair_as(const int64_t* ptr, v_float64& a, v_float64& b)
{
a = v_cvt_f64(vx_load(ptr));
b = v_cvt_f64(vx_load(ptr + v_float64::nlanes));
b = v_cvt_f64(vx_load(ptr + VTraits<v_float64>::vlanes()));
}
static inline void vx_load_pair_as(const unsigned* ptr, v_float64& a, v_float64& b)
{
const int nlanes = v_uint64::nlanes;
double buf[v_uint64::nlanes*2];
const int nlanes = VTraits<v_uint64>::vlanes();
double buf[VTraits<v_uint64>::max_nlanes*2];
for (int i = 0; i < nlanes*2; i++)
buf[i] = (double)ptr[i];
a = vx_load(buf);
@ -607,7 +607,7 @@ static inline void v_store_as(double* ptr, const v_float32& a)
{
v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
v_store(ptr, fa0);
v_store(ptr + v_float64::nlanes, fa1);
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
}
static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b)
@ -616,9 +616,9 @@ static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32&
v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
v_store(ptr, fa0);
v_store(ptr + v_float64::nlanes, fa1);
v_store(ptr + v_float64::nlanes*2, fb0);
v_store(ptr + v_float64::nlanes*3, fb1);
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
}
static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b)
@ -627,15 +627,15 @@ static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_floa
v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
v_store(ptr, fa0);
v_store(ptr + v_float64::nlanes, fa1);
v_store(ptr + v_float64::nlanes*2, fb0);
v_store(ptr + v_float64::nlanes*3, fb1);
v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
}
static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b)
{
v_store(ptr, a);
v_store(ptr + v_float64::nlanes, b);
v_store(ptr + VTraits<v_float64>::vlanes(), b);
}
static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b)
@ -662,7 +662,7 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_float64& a, const v_fl
v_int64 ia, ib;
v_expand(v_round(v_max(a, z), v_max(b, z)), ia, ib);
v_store(ptr, v_reinterpret_as_u64(ia));
v_store(ptr + v_int64::nlanes, v_reinterpret_as_u64(ib));
v_store(ptr + VTraits<v_uint64>::vlanes(), v_reinterpret_as_u64(ib));
}
static inline void v_store_pair_as(int64_t* ptr, const v_float64& a, const v_float64& b)
@ -670,7 +670,7 @@ static inline void v_store_pair_as(int64_t* ptr, const v_float64& a, const v_flo
v_int64 ia, ib;
v_expand(v_round(a, b), ia, ib);
v_store(ptr, ia);
v_store(ptr + v_int64::nlanes, ib);
v_store(ptr + VTraits<v_uint64>::vlanes(), ib);
}
static inline void v_store_pair_as(unsigned* ptr, const v_float64& a, const v_float64& b)
@ -744,9 +744,9 @@ static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b
ptr[i] = (double)buf[i];
}
#endif /////////// CV_SIMD_64F
#endif /////////// CV_SIMD_64F || CV_SIMD_SCALABLE_64F
#endif /////////// CV_SIMD
#endif /////////// CV_SIMD || CV_SIMD_SCALABLE
}

@ -41,8 +41,8 @@ void cvt16f32f( const float16_t* src, float* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -62,8 +62,8 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -83,8 +83,8 @@ void cvt32f16bf( const float* src, bfloat16_t* dst, int len )
{
CV_INSTRUMENT_REGION();
int j = 0;
#if CV_SIMD
const int VECSZ = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<v_float32>::vlanes();
for( ; j < len; j += VECSZ )
{
if( j > len - VECSZ )
@ -153,8 +153,8 @@ cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
const int VECSZ = _Twvec::nlanes*2;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<_Twvec>::vlanes()*2;
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -182,8 +182,8 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD_64F
const int VECSZ = v_float64::nlanes*2;
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
const int VECSZ = VTraits<v_float64>::vlanes()*2;
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -213,8 +213,8 @@ cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
const int VECSZ = _Twvec::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int VECSZ = VTraits<_Twvec>::vlanes();
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )

@ -22,9 +22,9 @@ template<typename _Ts, typename _Td> inline void
cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, float a, float b )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
const int VECSZ = v_float32::nlanes*2;
const int VECSZ = VTraits<v_float32>::vlanes()*2;
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -32,7 +32,7 @@ cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -72,9 +72,9 @@ template<typename _Ts, typename _Td> inline void
cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, float a, float b )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
const int VECSZ = v_float32::nlanes*2;
const int VECSZ = VTraits<v_float32>::vlanes()*2;
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -82,7 +82,7 @@ cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -108,9 +108,9 @@ template<typename _Ts, typename _Td> inline void
cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, float a, float b )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
const int VECSZ = v_float32::nlanes;
const int VECSZ = VTraits<v_float32>::vlanes();
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -118,7 +118,7 @@ cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )
@ -143,9 +143,9 @@ template<typename _Ts, typename _Td> inline void
cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
Size size, double a, double b )
{
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
const int VECSZ = v_float64::nlanes*2;
const int VECSZ = VTraits<v_float64>::vlanes()*2;
#endif
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
@ -153,7 +153,7 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
{
int j = 0;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
for( ; j < size.width; j += VECSZ )
{
if( j > size.width - VECSZ )

@ -171,15 +171,15 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
const uchar* src = (const uchar*)_src;
uchar* dst = (uchar*)_dst;
int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();
for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
{
v_uint8 v_src = vx_load(src + x),
v_dst = vx_load(dst + x),
v_nmask = vx_load(mask + x) == v_zero;
v_nmask = v_eq(vx_load(mask + x), v_zero);
v_dst = v_select(v_nmask, v_dst, v_src);
v_store(dst + x, v_dst);
@ -203,23 +203,23 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
const ushort* src = (const ushort*)_src;
ushort* dst = (ushort*)_dst;
int x = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint8 v_zero = vx_setzero_u8();
for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
{
v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);
v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + VTraits<v_uint16>::vlanes()),
v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + VTraits<v_uint16>::vlanes());
v_uint8 v_nmask1, v_nmask2;
v_uint8 v_nmask = vx_load(mask + x) == v_zero;
v_uint8 v_nmask = v_eq(vx_load(mask + x), v_zero);
v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
v_store(dst + x, v_dst1);
v_store(dst + x + v_uint16::nlanes, v_dst2);
v_store(dst + x + VTraits<v_uint16>::vlanes(), v_dst2);
}
}
vx_cleanup();

@ -32,8 +32,8 @@ static int countNonZero_(const T* src, int len )
static int countNonZero8u( const uchar* src, int len )
{
int i=0, nz = 0;
#if CV_SIMD
int len0 = len & -v_uint8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_uint8>::vlanes();
v_uint8 v_zero = vx_setzero_u8();
v_uint8 v_one = vx_setall_u8(1);
@ -42,20 +42,20 @@ static int countNonZero8u( const uchar* src, int len )
{
v_uint16 v_sum16 = vx_setzero_u16();
int j = i;
while (j < std::min(len0, i + 65280 * v_uint16::nlanes))
while (j < std::min(len0, i + 65280 * VTraits<v_uint16>::vlanes()))
{
v_uint8 v_sum8 = vx_setzero_u8();
int k = j;
for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)
v_sum8 += v_one & (vx_load(src + k) == v_zero);
for (; k < std::min(len0, j + 255 * VTraits<v_uint8>::vlanes()); k += VTraits<v_uint8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_eq(vx_load(src + k), v_zero)));
v_uint16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_uint32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
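// Note (illustrative comment, not part of the original change): the comparison mask is
// all-ones for zero elements, so (v_one & mask) adds 1 per *zero* lane. The innermost block
// is capped at 255 vector iterations so the 8-bit accumulators cannot overflow, and the
// middle block is capped so the widened 16-bit accumulators stay below 65536 before being
// widened to 32 bits; nz is then the processed length minus the counted zeros.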
@ -69,8 +69,8 @@ static int countNonZero8u( const uchar* src, int len )
static int countNonZero16u( const ushort* src, int len )
{
int i = 0, nz = 0;
#if CV_SIMD
int len0 = len & -v_int8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int8>::vlanes();
v_uint16 v_zero = vx_setzero_u16();
v_int8 v_one = vx_setall_s8(1);
@ -79,20 +79,20 @@ static int countNonZero16u( const ushort* src, int len )
{
v_int16 v_sum16 = vx_setzero_s16();
int j = i;
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
{
v_int8 v_sum8 = vx_setzero_s8();
int k = j;
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_reinterpret_as_s16(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s16(v_eq(vx_load(src + k + VTraits<v_uint16>::vlanes()), v_zero)))));
v_int16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_int32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
@ -104,8 +104,8 @@ static int countNonZero16u( const ushort* src, int len )
static int countNonZero32s( const int* src, int len )
{
int i = 0, nz = 0;
#if CV_SIMD
int len0 = len & -v_int8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int8>::vlanes();
v_int32 v_zero = vx_setzero_s32();
v_int8 v_one = vx_setall_s8(1);
@ -114,23 +114,20 @@ static int countNonZero32s( const int* src, int len )
{
v_int16 v_sum16 = vx_setzero_s16();
int j = i;
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
{
v_int8 v_sum8 = vx_setzero_s8();
int k = j;
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
v_sum8 += v_one & v_pack(
v_pack(vx_load(src + k ) == v_zero, vx_load(src + k + v_int32::nlanes) == v_zero),
v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)
);
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_eq(vx_load(src + k), v_zero), v_eq(vx_load(src + k + VTraits<v_int32>::vlanes()), v_zero)), v_pack(v_eq(vx_load(src + k + 2 * VTraits<v_int32>::vlanes()), v_zero), v_eq(vx_load(src + k + 3 * VTraits<v_int32>::vlanes()), v_zero)))));
v_int16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_int32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
@ -142,8 +139,8 @@ static int countNonZero32s( const int* src, int len )
static int countNonZero32f( const float* src, int len )
{
int i = 0, nz = 0;
#if CV_SIMD
int len0 = len & -v_int8::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int8>::vlanes();
v_float32 v_zero = vx_setzero_f32();
v_int8 v_one = vx_setall_s8(1);
@ -152,23 +149,20 @@ static int countNonZero32f( const float* src, int len )
{
v_int16 v_sum16 = vx_setzero_s16();
int j = i;
while (j < std::min(len0, i + 32766 * v_int16::nlanes))
while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
{
v_int8 v_sum8 = vx_setzero_s8();
int k = j;
for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
v_sum8 += v_one & v_pack(
v_pack(v_reinterpret_as_s32(vx_load(src + k ) == v_zero), v_reinterpret_as_s32(vx_load(src + k + v_float32::nlanes) == v_zero)),
v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))
);
for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + VTraits<v_float32>::vlanes()), v_zero))), v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k + 2 * VTraits<v_float32>::vlanes()), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + 3 * VTraits<v_float32>::vlanes()), v_zero))))));
v_int16 part1, part2;
v_expand(v_sum8, part1, part2);
v_sum16 += part1 + part2;
v_sum16 = v_add(v_sum16, v_add(part1, part2));
j = k;
}
v_int32 part1, part2;
v_expand(v_sum16, part1, part2);
v_sum32 += part1 + part2;
v_sum32 = v_add(v_sum32, v_add(part1, part2));
i = j;
}
nz = i - v_reduce_sum(v_sum32);
@ -180,21 +174,21 @@ static int countNonZero32f( const float* src, int len )
static int countNonZero64f( const double* src, int len )
{
int nz = 0, i = 0;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
v_int64 sum1 = vx_setzero_s64();
v_int64 sum2 = vx_setzero_s64();
v_float64 zero = vx_setzero_f64();
int step = v_float64::nlanes * 2;
int step = VTraits<v_float64>::vlanes() * 2;
int len0 = len & -step;
for(i = 0; i < len0; i += step )
{
sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
sum1 = v_add(sum1, v_reinterpret_as_s64(v_eq(vx_load(&src[i]), zero)));
sum2 = v_add(sum2, v_reinterpret_as_s64(v_eq(vx_load(&src[i + step / 2]), zero)));
}
// N.B. each zero element adds -1 (the all-ones mask 0xF...F) to the sum, so nz = i + reduce_sum yields the non-zero count
nz = i + (int)v_reduce_sum(sum1 + sum2);
nz = i + (int)v_reduce_sum(v_add(sum1, sum2));
v_cleanup();
#endif
return nz + countNonZero_(src + i, len - i);

@ -274,22 +274,21 @@ template<typename T> struct VBLAS
{
int dot(const T*, const T*, int, T*) const { return 0; }
int givens(T*, T*, int, T, T) const { return 0; }
int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
};
#if CV_SIMD
#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
{
if( n < 2*v_float32::nlanes )
if( n < 2*VTraits<v_float32>::vlanes() )
return 0;
int k = 0;
v_float32 s0 = vx_setzero_f32();
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
v_float32 a0 = vx_load(a + k);
v_float32 b0 = vx_load(b + k);
s0 += a0 * b0;
s0 = v_add(s0, v_mul(a0, b0));
}
*result = v_reduce_sum(s0);
vx_cleanup();
@ -299,16 +298,16 @@ template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, f
template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
{
if( n < v_float32::nlanes)
if( n < VTraits<v_float32>::vlanes())
return 0;
int k = 0;
v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
{
v_float32 a0 = vx_load(a + k);
v_float32 b0 = vx_load(b + k);
v_float32 t0 = (a0 * c4) + (b0 * s4);
v_float32 t1 = (b0 * c4) - (a0 * s4);
v_float32 t0 = v_add(v_mul(a0, c4), v_mul(b0, s4));
v_float32 t1 = v_sub(v_mul(b0, c4), v_mul(a0, s4));
v_store(a + k, t0);
v_store(b + k, t1);
}
@ -317,44 +316,19 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
}
template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s,
float* anorm, float* bnorm) const
{
if( n < v_float32::nlanes)
return 0;
int k = 0;
v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
{
v_float32 a0 = vx_load(a + k);
v_float32 b0 = vx_load(b + k);
v_float32 t0 = (a0 * c4) + (b0 * s4);
v_float32 t1 = (b0 * c4) - (a0 * s4);
v_store(a + k, t0);
v_store(b + k, t1);
sa += t0 + t0;
sb += t1 + t1;
}
*anorm = v_reduce_sum(sa);
*bnorm = v_reduce_sum(sb);
vx_cleanup();
return k;
}
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
{
if( n < 2*v_float64::nlanes )
if( n < 2*VTraits<v_float64>::vlanes() )
return 0;
int k = 0;
v_float64 s0 = vx_setzero_f64();
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
{
v_float64 a0 = vx_load(a + k);
v_float64 b0 = vx_load(b + k);
s0 += a0 * b0;
s0 = v_add(s0, v_mul(a0, b0));
}
double sbuf[2];
v_store(sbuf, s0);
@ -368,12 +342,12 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
{
int k = 0;
v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
{
v_float64 a0 = vx_load(a + k);
v_float64 b0 = vx_load(b + k);
v_float64 t0 = (a0 * c2) + (b0 * s2);
v_float64 t1 = (b0 * c2) - (a0 * s2);
v_float64 t0 = v_add(v_mul(a0, c2), v_mul(b0, s2));
v_float64 t1 = v_sub(v_mul(b0, c2), v_mul(a0, s2));
v_store(a + k, t0);
v_store(b + k, t1);
}
@ -382,30 +356,6 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
}
template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double c, double s,
double* anorm, double* bnorm) const
{
int k = 0;
v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
{
v_float64 a0 = vx_load(a + k);
v_float64 b0 = vx_load(b + k);
v_float64 t0 = (a0 * c2) + (b0 * s2);
v_float64 t1 = (b0 * c2) - (a0 * s2);
v_store(a + k, t0);
v_store(b + k, t1);
sa += t0 * t0;
sb += t1 * t1;
}
double abuf[2], bbuf[2];
v_store(abuf, sa);
v_store(bbuf, sb);
*anorm = abuf[0] + abuf[1];
*bnorm = bbuf[0] + bbuf[1];
return k;
}
#endif //CV_SIMD_64F
#endif //CV_SIMD
@ -916,7 +866,7 @@ double invert( InputArray _src, OutputArray _dst, int method )
#if CV_SIMD128
const float d_32f = (float)d;
const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f);
v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120
v_float32x4 s0 = v_mul(v_load_halves((const float *)srcdata, (const float *)(srcdata + srcstep)), d_vec);//0123//3120
s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0));
v_store_low((float*)dstdata, s0);
v_store_high((float*)(dstdata + dststep), s0);
@ -942,10 +892,10 @@ double invert( InputArray _src, OutputArray _dst, int method )
d = 1./d;
#if CV_SIMD128_64F
v_float64x2 det = v_setall_f64(d);
v_float64x2 s0 = v_load((const double*)srcdata) * det;
v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det;
v_float64x2 s0 = v_mul(v_load((const double *)srcdata), det);
v_float64x2 s1 = v_mul(v_load((const double *)(srcdata + srcstep)), det);
v_float64x2 sm = v_extract<1>(s1, s0);//30
v_float64x2 ss = v_setall<double>(0) - v_extract<1>(s0, s1);//12
v_float64x2 ss = v_sub(v_setall<double>(0), v_extract<1>(s0, s1));//12
v_store((double*)dstdata, v_combine_low(sm, ss));//31
v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20
#else

@ -614,13 +614,13 @@ void polarToCart( InputArray src1, InputArray src2,
{
k = 0;
#if CV_SIMD
int cWidth = v_float32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int cWidth = VTraits<v_float32>::vlanes();
for( ; k <= len - cWidth; k += cWidth )
{
v_float32 v_m = vx_load(mag + k);
v_store(x + k, vx_load(x + k) * v_m);
v_store(y + k, vx_load(y + k) * v_m);
v_store(x + k, v_mul(vx_load(x + k), v_m));
v_store(y + k, v_mul(vx_load(y + k), v_m));
}
vx_cleanup();
#endif
@ -741,7 +741,7 @@ struct iPow_SIMD
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct iPow_SIMD<uchar, int>
@ -751,7 +751,7 @@ struct iPow_SIMD<uchar, int>
int i = 0;
v_uint32 v_1 = vx_setall_u32(1u);
for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
{
v_uint32 v_a1 = v_1, v_a2 = v_1;
v_uint16 v = vx_load_expand(src + i);
@ -763,16 +763,16 @@ struct iPow_SIMD<uchar, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_pack_store(dst + i, v);
@ -791,7 +791,7 @@ struct iPow_SIMD<schar, int>
int i = 0;
v_int32 v_1 = vx_setall_s32(1);
for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
{
v_int32 v_a1 = v_1, v_a2 = v_1;
v_int16 v = vx_load_expand(src + i);
@ -803,16 +803,16 @@ struct iPow_SIMD<schar, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_pack_store(dst + i, v);
@ -831,7 +831,7 @@ struct iPow_SIMD<ushort, int>
int i = 0;
v_uint32 v_1 = vx_setall_u32(1u);
for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
{
v_uint32 v_a1 = v_1, v_a2 = v_1;
v_uint16 v = vx_load(src + i);
@ -843,16 +843,16 @@ struct iPow_SIMD<ushort, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_store(dst + i, v);
@ -871,7 +871,7 @@ struct iPow_SIMD<short, int>
int i = 0;
v_int32 v_1 = vx_setall_s32(1);
for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
{
v_int32 v_a1 = v_1, v_a2 = v_1;
v_int16 v = vx_load(src + i);
@ -883,16 +883,16 @@ struct iPow_SIMD<short, int>
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v = v_pack(v_a1, v_a2);
v_store(dst + i, v);
@ -911,29 +911,29 @@ struct iPow_SIMD<int, int>
int i = 0;
v_int32 v_1 = vx_setall_s32(1);
for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
for ( ; i <= len - VTraits<v_int32>::vlanes()*2; i += VTraits<v_int32>::vlanes()*2)
{
v_int32 v_a1 = v_1, v_a2 = v_1;
v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_int32>::vlanes());
int p = power;
while( p > 1 )
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v_store(dst + i, v_a1);
v_store(dst + i + v_int32::nlanes, v_a2);
v_store(dst + i + VTraits<v_int32>::vlanes(), v_a2);
}
vx_cleanup();
@ -949,34 +949,34 @@ struct iPow_SIMD<float, float>
int i = 0;
v_float32 v_1 = vx_setall_f32(1.f);
for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
for ( ; i <= len - VTraits<v_float32>::vlanes()*2; i += VTraits<v_float32>::vlanes()*2)
{
v_float32 v_a1 = v_1, v_a2 = v_1;
v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float32>::vlanes());
int p = std::abs(power);
if( power < 0 )
{
v_b1 = v_1 / v_b1;
v_b2 = v_1 / v_b2;
v_b1 = v_div(v_1, v_b1);
v_b2 = v_div(v_1, v_b2);
}
while( p > 1 )
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v_store(dst + i, v_a1);
v_store(dst + i + v_float32::nlanes, v_a2);
v_store(dst + i + VTraits<v_float32>::vlanes(), v_a2);
}
vx_cleanup();
@ -984,7 +984,7 @@ struct iPow_SIMD<float, float>
}
};
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template <>
struct iPow_SIMD<double, double>
{
@ -993,34 +993,34 @@ struct iPow_SIMD<double, double>
int i = 0;
v_float64 v_1 = vx_setall_f64(1.);
for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
for ( ; i <= len - VTraits<v_float64>::vlanes()*2; i += VTraits<v_float64>::vlanes()*2)
{
v_float64 v_a1 = v_1, v_a2 = v_1;
v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float64>::vlanes());
int p = std::abs(power);
if( power < 0 )
{
v_b1 = v_1 / v_b1;
v_b2 = v_1 / v_b2;
v_b1 = v_div(v_1, v_b1);
v_b2 = v_div(v_1, v_b2);
}
while( p > 1 )
{
if (p & 1)
{
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
}
v_b1 *= v_b1;
v_b2 *= v_b2;
v_b1 = v_mul(v_b1, v_b1);
v_b2 = v_mul(v_b2, v_b2);
p >>= 1;
}
v_a1 *= v_b1;
v_a2 *= v_b2;
v_a1 = v_mul(v_a1, v_b1);
v_a2 = v_mul(v_a2, v_b2);
v_store(dst + i, v_a1);
v_store(dst + i + v_float64::nlanes, v_a2);
v_store(dst + i + VTraits<v_float64>::vlanes(), v_a2);
}
vx_cleanup();
@ -1614,7 +1614,7 @@ void patchNaNs( InputOutputArray _a, double _val )
Cv32suf val;
val.f = (float)_val;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
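// Note (illustrative comment, not part of the original change): a float is NaN iff its
// exponent bits are all ones and its mantissa is non-zero, i.e. (bits & 0x7fffffff) > 0x7f800000;
// the comparison below builds exactly that mask and v_select patches the flagged lanes with val.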
v_int32 v_val = vx_setall_s32(val.i);
#endif
@ -1624,12 +1624,12 @@ void patchNaNs( InputOutputArray _a, double _val )
int* tptr = ptrs[0];
size_t j = 0;
#if CV_SIMD
size_t cWidth = (size_t)v_int32::nlanes;
#if (CV_SIMD || CV_SIMD_SCALABLE)
size_t cWidth = (size_t)VTraits<v_int32>::vlanes();
for ( ; j + cWidth <= len; j += cWidth)
{
v_int32 v_src = vx_load(tptr + j);
v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
v_int32 v_cmp_mask = v_lt(v_mask2, v_and(v_src, v_mask1));
v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
v_store(tptr + j, v_dst);
}

@ -1454,7 +1454,7 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
const int BITS = 10, SCALE = 1 << BITS;
const float MAX_M = (float)(1 << (15 - BITS));
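// Note (illustrative comment, not part of the original change): the float coefficients are
// converted to Q10 fixed point (scaled by SCALE = 1024, see the saturate_cast below), products
// are accumulated with 16/32-bit integer dot products, and the results are shifted back with
// v_rshr_pack<BITS>; MAX_M bounds |m| so the scaled coefficients still fit in 16 bits.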
@ -1485,7 +1485,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
v_int32 m10 = vx_setall_s32(m32[4]);
v_int32 m11 = vx_setall_s32(m32[5]);
int x = 0;
for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels)
for (; x <= (len - VTraits<v_uint8>::vlanes()) * nChannels; x += VTraits<v_uint8>::vlanes() * nChannels)
{
v_uint8 b, g, r;
v_load_deinterleave(src + x, b, g, r);
@ -1499,20 +1499,20 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
v_int32 p1, p3;
v_expand(bgl, p0, p2);
v_expand(v_reinterpret_as_s16(rl), p1, p3);
dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
dbl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
dgl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
drl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
v_expand(bgh, p0, p2);
v_expand(v_reinterpret_as_s16(rh), p1, p3);
dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 * m2 + m3,
v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 * m2 + m3);
dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 * m6 + m7,
v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 * m6 + m7);
drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
dbh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
dgh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
drh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
}
m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
@ -1537,7 +1537,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
static void
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( scn == 3 && dcn == 3 )
{
int x = 0;
@ -1555,7 +1555,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
v_float32 m10 = vx_setall_f32(m[10]);
v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
v_int16 delta = vx_setall_s16(-32768);
for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3)
for (; x <= (len - VTraits<v_uint16>::vlanes())*3; x += VTraits<v_uint16>::vlanes()*3)
{
v_uint16 b, g, r;
v_load_deinterleave(src + x, b, g, r);
@ -1574,6 +1574,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
}
#endif
#if CV_SIMD128
v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f);
v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f);
v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
@ -1587,6 +1588,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x ))), _m0h, _m1h, _m2h, _m3h)),
v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
#endif //CV_SIMD128
for( ; x < len * 3; x += 3 )
{
float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
@ -1606,25 +1608,25 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
static void
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD && !defined(__aarch64__) && !defined(_M_ARM64)
#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64)
int x = 0;
if( scn == 3 && dcn == 3 )
{
int idx[v_float32::nlanes/2];
for( int i = 0; i < v_float32::nlanes/4; i++ )
int idx[VTraits<v_float32>::max_nlanes/2];
for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
{
idx[i] = 3*i;
idx[i + v_float32::nlanes/4] = 0;
idx[i + VTraits<v_float32>::vlanes()/4] = 0;
}
float _m[] = { m[0], m[4], m[ 8], 0.f,
m[1], m[5], m[ 9], 0.f,
m[2], m[6], m[10], 0.f,
m[3], m[7], m[11], 0.f };
v_float32 m0 = vx_lut_quads(_m , idx + v_float32::nlanes/4);
v_float32 m1 = vx_lut_quads(_m + 4, idx + v_float32::nlanes/4);
v_float32 m2 = vx_lut_quads(_m + 8, idx + v_float32::nlanes/4);
v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 )
v_float32 m0 = vx_lut_quads(_m , idx + VTraits<v_float32>::vlanes()/4);
v_float32 m1 = vx_lut_quads(_m + 4, idx + VTraits<v_float32>::vlanes()/4);
v_float32 m2 = vx_lut_quads(_m + 8, idx + VTraits<v_float32>::vlanes()/4);
v_float32 m3 = vx_lut_quads(_m + 12, idx + VTraits<v_float32>::vlanes()/4);
for( ; x <= len*3 - VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes()/4 )
v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
for( ; x < len*3; x += 3 )
{
@ -1641,8 +1643,8 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
if( scn == 4 && dcn == 4 )
{
#if CV_SIMD_WIDTH > 16
int idx[v_float32::nlanes/4];
for( int i = 0; i < v_float32::nlanes/4; i++ )
int idx[VTraits<v_float32>::max_nlanes/4];
for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
idx[i] = 0;
float _m[] = { m[4], m[9], m[14], m[19] };
v_float32 m0 = vx_lut_quads(m , idx);
@ -1650,12 +1652,13 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
v_float32 m2 = vx_lut_quads(m+10, idx);
v_float32 m3 = vx_lut_quads(m+15, idx);
v_float32 m4 = vx_lut_quads(_m, idx);
for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes )
for( ; x <= len*4 - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes() )
{
v_float32 v_src = vx_load(src + x);
v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, m0), v_mul(v_src, m1), v_mul(v_src, m2), v_mul(v_src, m3)), m4));
}
#endif
#if CV_SIMD128
v_float32x4 _m0 = v_load(m );
v_float32x4 _m1 = v_load(m + 5);
v_float32x4 _m2 = v_load(m + 10);
@ -1666,6 +1669,17 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
v_float32x4 v_src = v_load(src + x);
v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
}
#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128
for( ; x < len*4; x += 4 )
{
float v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]*v3 + m[ 4]);
float t1 = saturate_cast<float>(m[5]*v0 + m[6]*v1 + m[ 7]*v2 + m[ 8]*v3 + m[ 9]);
float t2 = saturate_cast<float>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
float t3 = saturate_cast<float>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; dst[x+3] = t3;
}
#endif
vx_cleanup();
return;
}
@ -1936,9 +1950,9 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
{
float alpha = *_alpha;
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v_alpha = vx_setall_f32(alpha);
const int cWidth = v_float32::nlanes;
const int cWidth = VTraits<v_float32>::vlanes();
for (; i <= len - cWidth; i += cWidth)
v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
vx_cleanup();
@ -1953,9 +1967,9 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
{
double alpha = *_alpha;
int i = 0;
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
v_float64 a2 = vx_setall_f64(alpha);
const int cWidth = v_float64::nlanes;
const int cWidth = VTraits<v_float64>::vlanes();
for (; i <= len - cWidth; i += cWidth)
v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
vx_cleanup();
@ -2078,7 +2092,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
deltastep = deltastep ? 4 : 0;
}
#if CV_SIMD_64F
#if CV_SIMD128_64F
v_float64x2 v_scale = v_setall_f64(scale);
#endif
@ -2090,7 +2104,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
for( j = i; j <= size.width - 4; j += 4 )
{
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@ -2150,7 +2164,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
for( j = i; j <= size.width - 4; j += 4 )
{
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@ -2227,7 +2241,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
double s = 0;
const sT *tsrc1 = src + i*srcstep;
const sT *tsrc2 = src + j*srcstep;
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
const double *v_tsrc1 = (double *)(tsrc1);
@ -2280,7 +2294,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
delta_buf[2] = delta_buf[3] = tdelta2[0];
tdelta2 = delta_buf;
}
#if CV_SIMD_64F
#if CV_SIMD128_64F
if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
{
const double *v_tsrc2 = (double *)(tsrc2);
@ -2393,14 +2407,14 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
double r = 0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 15), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_uint32 v_sum = vx_setzero_u32();
const int cWidth = v_uint16::nlanes;
const int cWidth = VTraits<v_uint16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@ -2414,7 +2428,7 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
{
v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
v_sum = v_add(v_sum, v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20)));
}
r += (double)v_reduce_sum(v_sum);
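// Note (illustrative comment, not part of the original change): processing at most 1 << 15
// elements per block keeps the 32-bit accumulator from overflowing (2^15 * 255 * 255 < 2^32)
// before it is flushed into the double-precision running sum r.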
@ -2433,14 +2447,14 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 14), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_int32 v_sum = vx_setzero_s32();
const int cWidth = v_int16::nlanes;
const int cWidth = VTraits<v_int16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@ -2473,14 +2487,14 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 24), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_uint64 v_sum = vx_setzero_u64();
const int cWidth = v_uint16::nlanes;
const int cWidth = VTraits<v_uint16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth; j += cWidth)
@ -2505,14 +2519,14 @@ double dotProd_16s(const short* src1, const short* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 24), blockSize;
while (i < len0)
{
blockSize = std::min(len0 - i, blockSize0);
v_int64 v_sum = vx_setzero_s64();
const int cWidth = v_int16::nlanes;
const int cWidth = VTraits<v_int16>::vlanes();
int j = 0;
for (; j <= blockSize - cWidth; j += cWidth)
@ -2534,10 +2548,10 @@ double dotProd_16s(const short* src1, const short* src2, int len)
double dotProd_32s(const int* src1, const int* src2, int len)
{
#if CV_SIMD_64F
#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
double r = .0;
int i = 0;
const int step = v_int32::nlanes;
const int step = VTraits<v_int32>::vlanes();
v_float64 v_sum0 = vx_setzero_f64();
#if CV_SIMD_WIDTH == 16
const int wstep = step * 2;
@ -2572,8 +2586,8 @@ double dotProd_32f(const float* src1, const float* src2, int len)
double r = 0.0;
int i = 0;
#if CV_SIMD
int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
#if (CV_SIMD || CV_SIMD_SCALABLE)
int len0 = len & -VTraits<v_float32>::vlanes(), blockSize0 = (1 << 13), blockSize;
while (i < len0)
{
@ -2581,7 +2595,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
v_float32 v_sum = vx_setzero_f32();
int j = 0;
int cWidth = v_float32::nlanes;
int cWidth = VTraits<v_float32>::vlanes();
#if CV_ENABLE_UNROLLED
v_float32 v_sum1 = vx_setzero_f32();
@ -2600,7 +2614,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
vx_load(src2 + j + (cWidth * 3)), v_sum3);
}
v_sum += v_sum1 + v_sum2 + v_sum3;
v_sum = v_add(v_sum, v_add(v_add(v_sum1, v_sum2), v_sum3));
#endif
for (; j <= blockSize - cWidth; j += cWidth)

@ -7,6 +7,7 @@
#include "opencv2/core/detail/dispatch_helper.impl.hpp"
#include <algorithm> // std::swap_ranges
#include <numeric> // std::accumulate
namespace cv {
@ -440,7 +441,7 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
static void
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
#if CV_SIMD
#if CV_SIMD128
#if CV_STRONG_ALIGNMENT
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
#endif
@ -563,7 +564,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
}
#endif
else
#endif // CV_SIMD
#endif // CV_SIMD128
{
int i, j, limit = (int)(((size.width + 1)/2)*esz);
AutoBuffer<int> _tab(size.width*esz);
@ -596,12 +597,12 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
dst0 += dstep, dst1 -= dstep )
{
int i = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
#if CV_STRONG_ALIGNMENT
if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
#endif
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_int32 t0 = v_reinterpret_as_s32(vx_load(src0 + i));
v_int32 t1 = v_reinterpret_as_s32(vx_load(src1 + i));
@ -612,7 +613,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
#if CV_STRONG_ALIGNMENT
else
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint8 t0 = vx_load(src0 + i);
v_uint8 t1 = vx_load(src1 + i);
@ -857,6 +858,223 @@ void flipND(InputArray _src, OutputArray _dst, int _axis)
flipNDImpl(dst.ptr(), dst.size.p, dst.step.p, axis);
}
/*
This function first prepends 1's to each tensor shape so that all tensors have a common max_ndims dimensionality, then flattens the non-broadcast dimensions.
*/
static bool _flatten_for_broadcast(int narrays, int max_ndims, const int* ndims, const int** orig_shape,
int** flatten_shape, size_t** flatten_step) {
int i, j, k;
// step 1.
// * make all inputs and the output max_ndims-dimensional.
// * compute proper steps
for (i = max_ndims - 1; i >= 0; i-- ) {
for (k = 0; k < narrays; k++) {
j = ndims[k] - (max_ndims - i);
int sz_i = j >= 0 ? orig_shape[k][j] : 1;
size_t st_i = i == max_ndims - 1 ? 1 : flatten_step[k][i+1] * flatten_shape[k][i+1];
flatten_shape[k][i] = sz_i;
flatten_step[k][i] = st_i;
if (flatten_shape[k][i] == 0)
return false;
}
}
// step 2. Let's do the flattening first,
// since we'd need proper values of steps to check continuity.
// this loop is probably the trickiest part
// of the whole broadcasting implementation.
j = max_ndims-1;
for (i = j - 1; i >= 0; i--) {
bool all_contiguous = true, all_scalars = true, all_consistent = true;
for(k = 0; k < narrays; k++) {
size_t st = flatten_step[k][j] * flatten_shape[k][j];
bool prev_scalar = flatten_shape[k][j] == 1;
bool scalar = flatten_shape[k][i] == 1;
all_contiguous = all_contiguous && (st == flatten_step[k][i]);
all_scalars = all_scalars && scalar;
all_consistent = all_consistent && (scalar == prev_scalar);
}
if (all_contiguous && (all_consistent || all_scalars)) {
for(k = 0; k < narrays; k++)
flatten_shape[k][j] *= flatten_shape[k][i];
} else {
j--;
if (i < j) {
for(k = 0; k < narrays; k++) {
flatten_shape[k][j] = flatten_shape[k][i];
flatten_step[k][j] = flatten_step[k][i];
}
}
}
}
// step 3. Set some steps to 0.
for (i = max_ndims-1; i >= j; i--) {
for (k = 0; k < narrays; k++)
flatten_step[k][i] = flatten_shape[k][i] == 1 ? 0 : flatten_step[k][i];
}
for (; i >= 0; i--) {
for (k = 0; k < narrays; k++) {
flatten_step[k][i] = 0;
flatten_shape[k][i] = 1;
}
}
return true;
}
void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) {
CV_INSTRUMENT_REGION();
Mat src = _src.getMat();
CV_CheckTrue(src.isContinuous(), "broadcast: input array must be contiguous");
CV_CheckChannelsEQ(src.channels(), 1, "broadcast: input array must be single channel");
Mat shape = _shape.getMat();
CV_CheckTypeEQ(shape.type(), CV_32S, "broadcast: target shape must be of type int32");
const auto dims_shape = static_cast<int>(shape.total());
const auto *ptr_shape = shape.ptr<int>();
// check valid shape, 1D/0D Mat would fail in the following checks
const auto dims_src = src.dims;
CV_CheckLE(dims_src, dims_shape,
"broadcast: dimension of input array must be less than or equal to dimension of target shape");
std::vector<int> shape_src{src.size.p, src.size.p + dims_src};
if (shape_src.size() < static_cast<size_t>(dims_shape)) {
shape_src.insert(shape_src.begin(), dims_shape - shape_src.size(), 1);
}
for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
const auto *shape_target = ptr_shape;
if (shape_src[i] != 1) {
CV_CheckEQ(shape_src[i], shape_target[i], "broadcast: input dimension must be 1 or equal to the corresponding target dimension");
}
}
// impl
_dst.create(dims_shape, shape.ptr<int>(), src.type());
Mat dst = _dst.getMat();
std::vector<int> is_same_shape(dims_shape, 0);
for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
if (shape_src[i] == ptr_shape[i]) {
is_same_shape[i] = 1;
}
}
// copy if same shape
if (std::accumulate(is_same_shape.begin(), is_same_shape.end(), 1, std::multiplies<int>()) != 0) {
const auto *p_src = src.ptr<const char>();
auto *p_dst = dst.ptr<char>();
std::memcpy(p_dst, p_src, dst.total() * dst.elemSize());
return;
}
// other cases
int max_ndims = std::max(dims_src, dims_shape);
const int all_ndims[2] = {src.dims, dst.dims};
const int* orig_shapes[2] = {src.size.p, dst.size.p};
cv::AutoBuffer<size_t> buff(max_ndims * 4);
int* flatten_shapes[2] = {(int*)buff.data(), (int*)(buff.data() + max_ndims)};
size_t* flatten_steps[2] = {(size_t*)(buff.data() + 2 * max_ndims), (size_t*)(buff.data() + 3 * max_ndims)};
if (_flatten_for_broadcast(2, max_ndims, all_ndims, orig_shapes, flatten_shapes, flatten_steps)) {
size_t src_dp = flatten_steps[0][max_ndims - 1];
size_t dst_dp = flatten_steps[1][max_ndims - 1];
CV_Assert(dst_dp == 1);
CV_Assert(max_ndims >= 2); // >= 3?
size_t rowstep_src = flatten_steps[0][max_ndims - 2];
size_t rowstep_dst = flatten_steps[1][max_ndims - 2];
const char* ptr_src = src.ptr<const char>();
char* ptr_dst = dst.ptr<char>();
size_t esz = src.elemSize();
int nrows = flatten_shapes[1][max_ndims - 2];
int ncols = flatten_shapes[1][max_ndims - 1];
int nplanes = 1;
CV_Check(esz, esz == 1 || esz == 2 || esz == 4 || esz == 8, "broadcast: unsupported data type");
for (int k = 0; k < max_ndims - 2; k++) {
nplanes *= flatten_shapes[1][k];
}
for (int plane_idx = 0; plane_idx < nplanes; plane_idx++) {
size_t offset_src = 0, offset_dst = 0;
size_t idx = (size_t)plane_idx;
for (int k = max_ndims - 3; k >= 0; k--) {
size_t prev_idx = idx / flatten_shapes[1][k];
size_t i_k = (int)(idx - prev_idx * flatten_shapes[1][k]);
offset_src += i_k * flatten_steps[0][k];
offset_dst += i_k * flatten_steps[1][k];
idx = prev_idx;
}
#define OPENCV_CORE_BROADCAST_LOOP(_Tp) \
for (int i = 0; i < nrows; i++) { \
const _Tp *ptr_src_ = (const _Tp*)ptr_src + offset_src + rowstep_src * i; \
_Tp *ptr_dst_ = (_Tp*)ptr_dst + offset_dst + rowstep_dst * i; \
if (src_dp == 1) { \
for (int j = 0; j < ncols; j++) { \
ptr_dst_[j] = ptr_src_[j]; \
} \
} else { \
_Tp x = *ptr_src_; \
for (int j = 0; j < ncols; j++) { \
ptr_dst_[j] = x; \
} \
} \
}
if (esz == 1) {
OPENCV_CORE_BROADCAST_LOOP(int8_t);
} else if (esz == 2) {
OPENCV_CORE_BROADCAST_LOOP(int16_t);
} else if (esz == 4) {
OPENCV_CORE_BROADCAST_LOOP(int32_t);
} else if (esz == 8) {
OPENCV_CORE_BROADCAST_LOOP(int64_t);
} else {
CV_Error(cv::Error::StsNotImplemented, "");
}
#undef OPENCV_CORE_BROADCAST_LOOP
}
} else {
// initial copy (src to dst)
std::vector<size_t> step_src{src.step.p, src.step.p + dims_src};
if (step_src.size() < static_cast<size_t>(dims_shape)) {
step_src.insert(step_src.begin(), dims_shape - step_src.size(), step_src[0]);
}
for (size_t i = 0; i < src.total(); ++i) {
size_t t = i;
size_t src_offset = 0, dst_offset = 0;
for (int j = static_cast<int>(shape_src.size() - 1); j >= 0; --j) {
size_t idx = t / shape_src[j];
size_t offset = static_cast<size_t>(t - idx * shape_src[j]);
src_offset += offset * step_src[j];
dst_offset += offset * dst.step[j];
t = idx;
}
const auto *p_src = src.ptr<const char>();
auto *p_dst = dst.ptr<char>();
std::memcpy(p_dst + dst_offset, p_src + src_offset, dst.elemSize());
}
// broadcast copy (dst inplace)
std::vector<int> cumulative_shape(dims_shape, 1);
int total = static_cast<int>(dst.total());
for (int i = dims_shape - 1; i >= 0; --i) {
cumulative_shape[i] = static_cast<int>(total / ptr_shape[i]);
total = cumulative_shape[i];
}
for (int i = dims_shape - 1; i >= 0; --i) {
if (is_same_shape[i] == 1) {
continue;
}
auto step = dst.step[i];
auto *p_dst = dst.ptr<char>();
for (int j = 0; j < cumulative_shape[i]; j++) {
for (int k = 0; k < ptr_shape[i] - 1; k++) {
std::memcpy(p_dst + step, p_dst, step);
p_dst += step;
}
p_dst += step;
}
}
}
}
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
{
CV_Assert(_src.dims() <= 2);
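The new cv::broadcast applies numpy-style rules: the source shape is right-aligned against the target shape, and every source dimension must either equal the target dimension or be 1. A minimal usage sketch, assuming the cv::broadcast declaration this merge adds to core.hpp (sizes and values are arbitrary):

// Sketch: broadcast a 2x1 column to a 2x3 matrix with cv::broadcast.
#include <opencv2/core.hpp>
#include <vector>

int main()
{
    int data[2] = {1, 2};
    cv::Mat src(2, 1, CV_32SC1, data);     // source shape {2, 1}
    std::vector<int> target{2, 3};         // each source dim equals the target dim or is 1
    cv::Mat dst;
    cv::broadcast(src, target, dst);       // dst is 2x3: a row of 1s and a row of 2s
    // dst.at<int>(0, 2) == 1, dst.at<int>(1, 0) == 2
    return 0;
}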

@ -8,20 +8,24 @@
#include "opencv2/core/openvx/ovx_defs.hpp"
#include "stat.hpp"
#ifndef OPENCV_IPP_MEAN
#undef HAVE_IPP
#undef CV_IPP_RUN_FAST
#define CV_IPP_RUN_FAST(f, ...)
#undef CV_IPP_RUN
#define CV_IPP_RUN(c, f, ...)
#endif // OPENCV_IPP_MEAN
#include "mean.simd.hpp"
#include "mean.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
#ifndef OPENCV_IPP_MEAN
#undef HAVE_IPP
#undef CV_IPP_RUN_FAST
#define CV_IPP_RUN_FAST(f, ...)
#undef CV_IPP_RUN
#define CV_IPP_RUN(c, f, ...)
#endif // OPENCV_IPP_MEAN
namespace cv {

@ -121,6 +121,7 @@ void merge(const Mat* mv, size_t n, OutputArray _dst)
CV_INSTRUMENT_REGION();
CV_Assert( mv && n > 0 );
CV_Assert(!mv[0].empty());
int depth = mv[0].depth();
bool allch1 = true;

@ -15,7 +15,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
/*
The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
on IA there are instructions movntps and such to which
@ -38,7 +38,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
const int VECSZ = VecT::nlanes;
const int VECSZ = VTraits<VecT>::vlanes();
int i, i0 = 0;
const T* src0 = src[0];
const T* src1 = src[1];
@ -173,8 +173,8 @@ merge_( const T** src, T* dst, int len, int cn )
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
@ -184,8 +184,8 @@ void merge8u(const uchar** src, uchar* dst, int len, int cn )
void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
@ -195,8 +195,8 @@ void merge16u(const ushort** src, ushort* dst, int len, int cn )
void merge32s(const int** src, int* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_int32>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<int, v_int32>(src, dst, len, cn);
else
#endif
@ -206,8 +206,8 @@ void merge32s(const int** src, int* dst, int len, int cn )
void merge64s(const int64** src, int64* dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
vecmerge_<int64, v_int64>(src, dst, len, cn);
else
#endif

@ -11,11 +11,13 @@
#include <algorithm>
#ifndef OPENCV_IPP_MINMAX
#undef HAVE_IPP
#undef CV_IPP_RUN_FAST
#define CV_IPP_RUN_FAST(f, ...)
#undef CV_IPP_RUN
#define CV_IPP_RUN(c, f, ...)
#endif // OPENCV_IPP_MINMAX
#define IPP_DISABLE_MINMAXIDX_MANY_ROWS 1 // see Core_MinMaxIdx.rows_overflow test

@ -63,25 +63,25 @@ int normHamming(const uchar* a, int n, int cellSize)
return -1;
int i = 0;
int result = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint64 t = vx_setzero_u64();
if ( cellSize == 2)
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask))));
}
}
else // cellSize == 4
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
v_uint16 a1 = a0 | (a0 >> 2);
t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask));
v_uint16 a1 = v_or(a0, v_shr<2>(a0));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask))));
}
}
@ -109,25 +109,25 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
return -1;
int i = 0;
int result = 0;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_uint64 t = vx_setzero_u64();
if ( cellSize == 2)
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask));
v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab0, v_shr<1>(ab0)), mask))));
}
}
else // cellSize == 4
{
v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
{
v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
v_uint16 ab1 = ab0 | (ab0 >> 2);
t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask));
v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
v_uint16 ab1 = v_or(ab0, v_shr<2>(ab0));
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab1, v_shr<1>(ab1)), mask))));
}
}
result += (int)v_reduce_sum(t);
@ -145,21 +145,21 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
float normL2Sqr_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
{
v_float32 t0 = vx_load(a + j) - vx_load(b + j);
v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
v_float32 t0 = v_sub(vx_load(a + j), vx_load(b + j));
v_float32 t1 = v_sub(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes()));
v_d0 = v_muladd(t0, t0, v_d0);
v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
v_float32 t2 = v_sub(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes()));
v_d1 = v_muladd(t1, t1, v_d1);
v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
v_float32 t3 = v_sub(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes()));
v_d2 = v_muladd(t2, t2, v_d2);
v_d3 = v_muladd(t3, t3, v_d3);
}
d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
#endif
for( ; j < n; j++ )
{
@ -173,17 +173,17 @@ float normL2Sqr_(const float* a, const float* b, int n)
float normL1_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
{
v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j));
v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes));
v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes));
v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes));
v_d0 = v_add(v_d0, v_absdiff(vx_load(a + j), vx_load(b + j)));
v_d1 = v_add(v_d1, v_absdiff(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes())));
v_d2 = v_add(v_d2, v_absdiff(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes())));
v_d3 = v_add(v_d3, v_absdiff(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes())));
}
d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
#endif
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
@ -193,12 +193,12 @@ float normL1_(const float* a, const float* b, int n)
int normL1_(const uchar* a, const uchar* b, int n)
{
int j = 0, d = 0;
#if CV_SIMD
for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes)
#if (CV_SIMD || CV_SIMD_SCALABLE)
for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes())
d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) +
v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) +
v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) +
v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes));
v_reduce_sad(vx_load(a + j + VTraits<v_uint8>::vlanes()), vx_load(b + j + VTraits<v_uint8>::vlanes())) +
v_reduce_sad(vx_load(a + j + 2 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 2 * VTraits<v_uint8>::vlanes())) +
v_reduce_sad(vx_load(a + j + 3 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 3 * VTraits<v_uint8>::vlanes()));
#endif
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
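For cell-based Hamming norms, each cell should contribute at most 1 to the distance, so the code folds every cell onto its lowest bit before the popcount: for cellSize == 2 it ORs the value with itself shifted right by 1 and masks with 0x55, for cellSize == 4 it folds by 2 and then by 1 and masks with 0x11. A scalar sketch of the same reduction (hammingCells is illustrative only):

// Sketch: scalar version of the per-cell fold used in normHamming.
#include <cstdint>
#include <bitset>

static int hammingCells(uint8_t byte, int cellSize)
{
    if (cellSize == 2)
    {
        // each 2-bit cell contributes 1 if any of its bits is set
        uint8_t folded = (byte | (byte >> 1)) & 0x55;   // 01010101
        return (int)std::bitset<8>(folded).count();
    }
    else // cellSize == 4
    {
        uint8_t a1 = byte | (byte >> 2);
        uint8_t folded = (a1 | (a1 >> 1)) & 0x11;       // 00010001
        return (int)std::bitset<8>(folded).count();
    }
}
// hammingCells(0xC1, 2) == 2 : 2-bit cells {11, 00, 00, 01} -> two non-zero cells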

@ -322,16 +322,20 @@ int decodeSimpleFormat( const char* dt )
}
#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64)
#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 1
#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64) || \
(defined (__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__)
#define CV_LITTLE_ENDIAN_MEM_ACCESS 1
#else
#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 0
#define CV_LITTLE_ENDIAN_MEM_ACCESS 0
#endif
static inline int readInt(const uchar* p)
{
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
return *(const int*)p;
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
#if CV_LITTLE_ENDIAN_MEM_ACCESS
int val;
memcpy(&val, p, sizeof(val));
return val;
#else
int val = (int)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
return val;
@ -340,8 +344,11 @@ static inline int readInt(const uchar* p)
static inline double readReal(const uchar* p)
{
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
return *(const double*)p;
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
#if CV_LITTLE_ENDIAN_MEM_ACCESS
double val;
memcpy(&val, p, sizeof(val));
return val;
#else
unsigned val0 = (unsigned)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
unsigned val1 = (unsigned)(p[4] | (p[5] << 8) | (p[6] << 16) | (p[7] << 24));
@ -353,9 +360,9 @@ static inline double readReal(const uchar* p)
static inline void writeInt(uchar* p, int ival)
{
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
int* ip = (int*)p;
*ip = ival;
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
#if CV_LITTLE_ENDIAN_MEM_ACCESS
memcpy(p, &ival, sizeof(ival));
#else
p[0] = (uchar)ival;
p[1] = (uchar)(ival >> 8);
@ -366,9 +373,9 @@ static inline void writeInt(uchar* p, int ival)
static inline void writeReal(uchar* p, double fval)
{
#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
double* fp = (double*)p;
*fp = fval;
// On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
#if CV_LITTLE_ENDIAN_MEM_ACCESS
memcpy(p, &fval, sizeof(fval));
#else
Cv64suf v;
v.f = fval;
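The persistence readers and writers now use memcpy instead of casting the byte pointer to int* or double*, which sidesteps unaligned and type-punned accesses that are undefined behaviour; on typical little-endian targets compilers fold the memcpy into a single load or store. A standalone sketch of the same idea (readLE32 is an illustrative helper, not part of the patch):

// Sketch: alignment-safe read of a little-endian 32-bit int from a byte buffer.
#include <cstring>
#include <cstdint>

static int32_t readLE32(const unsigned char* p)
{
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
    int32_t v;
    std::memcpy(&v, p, sizeof(v));   // safe even if p is unaligned; usually compiles to one load
    return v;
#else
    // portable byte-by-byte assembly for big-endian or unknown targets
    return (int32_t)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
#endif
}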

@ -308,8 +308,8 @@ public:
if( !multiline )
{
ptr = fs->resizeWriteBuffer( ptr, len + 9 );
sprintf( ptr, "<!-- %s -->", comment );
ptr = fs->resizeWriteBuffer( ptr, len + 5+4+1 );
snprintf( ptr, len + 5+4+1, "<!-- %s -->", comment );
len = (int)strlen(ptr);
}
else
@ -344,7 +344,7 @@ public:
fs->setBufferPtr(ptr);
ptr = fs->flush();
}
sprintf( ptr, "-->" );
strcpy( ptr, "-->" );
fs->setBufferPtr(ptr + 3);
fs->flush();
}

@ -15,12 +15,12 @@ void split64s(const int64* src, int64** dst, int len, int cn);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
// see the comments for vecmerge_ in merge.cpp
template<typename T, typename VecT> static void
vecsplit_( const T* src, T** dst, int len, int cn )
{
const int VECSZ = VecT::nlanes;
const int VECSZ = VTraits<VecT>::vlanes();
int i, i0 = 0;
T* dst0 = dst[0];
T* dst1 = dst[1];
@ -177,8 +177,8 @@ split_( const T* src, T** dst, int len, int cn )
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<uchar, v_uint8>(src, dst, len, cn);
else
#endif
@ -188,8 +188,8 @@ void split8u(const uchar* src, uchar** dst, int len, int cn )
void split16u(const ushort* src, ushort** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<ushort, v_uint16>(src, dst, len, cn);
else
#endif
@ -199,8 +199,8 @@ void split16u(const ushort* src, ushort** dst, int len, int cn )
void split32s(const int* src, int** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_uint32>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<int, v_int32>(src, dst, len, cn);
else
#endif
@ -210,8 +210,8 @@ void split32s(const int* src, int** dst, int len, int cn )
void split64s(const int64* src, int64** dst, int len, int cn )
{
CV_INSTRUMENT_REGION();
#if CV_SIMD
if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
#if (CV_SIMD || CV_SIMD_SCALABLE)
if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
vecsplit_<int64, v_int64>(src, dst, len, cn);
else
#endif

@ -33,11 +33,11 @@ int normHamming(const uchar* a, int n)
int i = 0;
int result = 0;
#if CV_SIMD && CV_SIMD_WIDTH > 16
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i))));
result = (int)v_reduce_sum(t);
vx_cleanup();
}
@ -56,13 +56,6 @@ int normHamming(const uchar* a, int n)
result += CV_POPCNT_U32(*(uint*)(a + i));
}
}
#elif CV_SIMD
{
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
result += (int)v_reduce_sum(t);
}
#endif
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)
@ -85,11 +78,11 @@ int normHamming(const uchar* a, const uchar* b, int n)
int i = 0;
int result = 0;
#if CV_SIMD && CV_SIMD_WIDTH > 16
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
t = v_add(t, v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i)))));
result += (int)v_reduce_sum(t);
}
#endif
@ -107,13 +100,6 @@ int normHamming(const uchar* a, const uchar* b, int n)
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
}
}
#elif CV_SIMD
{
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i)));
result += (int)v_reduce_sum(t);
}
#endif
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)

@ -10,11 +10,13 @@
#include "sum.simd.hpp"
#include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
#ifndef OPENCV_IPP_SUM
#undef HAVE_IPP
#undef CV_IPP_RUN_FAST
#define CV_IPP_RUN_FAST(f, ...)
#undef CV_IPP_RUN
#define CV_IPP_RUN(c, f, ...)
#endif // OPENCV_IPP_SUM
namespace cv
{

@ -22,7 +22,7 @@ struct Sum_SIMD
}
};
#if CV_SIMD
#if (CV_SIMD || CV_SIMD_SCALABLE)
template <>
struct Sum_SIMD<uchar, int>
@ -36,41 +36,41 @@ struct Sum_SIMD<uchar, int>
int x = 0;
v_uint32 v_sum = vx_setzero_u32();
int len0 = len & -v_uint8::nlanes;
int len0 = len & -VTraits<v_uint8>::vlanes();
while (x < len0)
{
const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
const int len_tmp = min(x + 256*VTraits<v_uint16>::vlanes(), len0);
v_uint16 v_sum16 = vx_setzero_u16();
for (; x < len_tmp; x += v_uint8::nlanes)
for (; x < len_tmp; x += VTraits<v_uint8>::vlanes())
{
v_uint16 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum16 += v_src0 + v_src1;
v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
}
v_uint32 v_half0, v_half1;
v_expand(v_sum16, v_half0, v_half1);
v_sum += v_half0 + v_half1;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
}
if (x <= len - v_uint16::nlanes)
if (x <= len - VTraits<v_uint16>::vlanes())
{
v_uint32 v_half0, v_half1;
v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
v_sum += v_half0 + v_half1;
x += v_uint16::nlanes;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
x += VTraits<v_uint16>::vlanes();
}
if (x <= len - v_uint32::nlanes)
if (x <= len - VTraits<v_uint32>::vlanes())
{
v_sum += vx_load_expand_q(src0 + x);
x += v_uint32::nlanes;
v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
x += VTraits<v_uint32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_uint32::nlanes; ++i)
for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -91,41 +91,41 @@ struct Sum_SIMD<schar, int>
int x = 0;
v_int32 v_sum = vx_setzero_s32();
int len0 = len & -v_int8::nlanes;
int len0 = len & -VTraits<v_int8>::vlanes();
while (x < len0)
{
const int len_tmp = min(x + 256*v_int16::nlanes, len0);
const int len_tmp = min(x + 256*VTraits<v_int16>::vlanes(), len0);
v_int16 v_sum16 = vx_setzero_s16();
for (; x < len_tmp; x += v_int8::nlanes)
for (; x < len_tmp; x += VTraits<v_int8>::vlanes())
{
v_int16 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum16 += v_src0 + v_src1;
v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
}
v_int32 v_half0, v_half1;
v_expand(v_sum16, v_half0, v_half1);
v_sum += v_half0 + v_half1;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
}
if (x <= len - v_int16::nlanes)
if (x <= len - VTraits<v_int16>::vlanes())
{
v_int32 v_half0, v_half1;
v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
v_sum += v_half0 + v_half1;
x += v_int16::nlanes;
v_sum = v_add(v_sum, v_add(v_half0, v_half1));
x += VTraits<v_int16>::vlanes();
}
if (x <= len - v_int32::nlanes)
if (x <= len - VTraits<v_int32>::vlanes())
{
v_sum += vx_load_expand_q(src0 + x);
x += v_int32::nlanes;
v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
x += VTraits<v_int32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -146,25 +146,25 @@ struct Sum_SIMD<ushort, int>
int x = 0;
v_uint32 v_sum = vx_setzero_u32();
for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes)
for (; x <= len - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
{
v_uint32 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum += v_src0 + v_src1;
v_sum = v_add(v_sum, v_add(v_src0, v_src1));
}
if (x <= len - v_uint32::nlanes)
if (x <= len - VTraits<v_uint32>::vlanes())
{
v_sum += vx_load_expand(src0 + x);
x += v_uint32::nlanes;
v_sum = v_add(v_sum, vx_load_expand(src0 + x));
x += VTraits<v_uint32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_uint32::nlanes; ++i)
for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -185,25 +185,25 @@ struct Sum_SIMD<short, int>
int x = 0;
v_int32 v_sum = vx_setzero_s32();
for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
for (; x <= len - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
{
v_int32 v_src0, v_src1;
v_expand(vx_load(src0 + x), v_src0, v_src1);
v_sum += v_src0 + v_src1;
v_sum = v_add(v_sum, v_add(v_src0, v_src1));
}
if (x <= len - v_int32::nlanes)
if (x <= len - VTraits<v_int32>::vlanes())
{
v_sum += vx_load_expand(src0 + x);
x += v_int32::nlanes;
v_sum = v_add(v_sum, vx_load_expand(src0 + x));
x += VTraits<v_int32>::vlanes();
}
if (cn == 1)
*dst += v_reduce_sum(v_sum);
else
{
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
v_store_aligned(ar, v_sum);
for (int i = 0; i < v_int32::nlanes; ++i)
for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
dst[i % cn] += ar[i];
}
v_cleanup();
@ -212,7 +212,7 @@ struct Sum_SIMD<short, int>
}
};
#if CV_SIMD_64F
#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
template <>
struct Sum_SIMD<int, double>
{
@ -226,24 +226,24 @@ struct Sum_SIMD<int, double>
v_float64 v_sum0 = vx_setzero_f64();
v_float64 v_sum1 = vx_setzero_f64();
for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes)
for (; x <= len - 2 * VTraits<v_int32>::vlanes(); x += 2 * VTraits<v_int32>::vlanes())
{
v_int32 v_src0 = vx_load(src0 + x);
v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes);
v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
v_int32 v_src1 = vx_load(src0 + x + VTraits<v_int32>::vlanes());
v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
}
#if CV_SIMD256 || CV_SIMD512
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
v_store_aligned(ar, v_sum0 + v_sum1);
for (int i = 0; i < v_float64::nlanes; ++i)
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_add(v_sum0, v_sum1));
for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#else
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_sum0);
v_store_aligned(ar + v_float64::nlanes, v_sum1);
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#endif
v_cleanup();
@ -265,24 +265,24 @@ struct Sum_SIMD<float, double>
v_float64 v_sum0 = vx_setzero_f64();
v_float64 v_sum1 = vx_setzero_f64();
for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
for (; x <= len - 2 * VTraits<v_float32>::vlanes(); x += 2 * VTraits<v_float32>::vlanes())
{
v_float32 v_src0 = vx_load(src0 + x);
v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
v_float32 v_src1 = vx_load(src0 + x + VTraits<v_float32>::vlanes());
v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
}
#if CV_SIMD256 || CV_SIMD512
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
v_store_aligned(ar, v_sum0 + v_sum1);
for (int i = 0; i < v_float64::nlanes; ++i)
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_add(v_sum0, v_sum1));
for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#else
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
v_store_aligned(ar, v_sum0);
v_store_aligned(ar + v_float64::nlanes, v_sum1);
for (int i = 0; i < 2 * v_float64::nlanes; ++i)
v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
dst[i % cn] += ar[i];
#endif
v_cleanup();
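Because scalable vector extensions only fix the lane count at run time, stack buffers that receive a whole register are now declared with VTraits<...>::max_nlanes (a compile-time upper bound), while the loops keep using vlanes(). A minimal sketch of that pattern, assuming the universal intrinsics header (spill_sum_sketch is an illustrative name):

// Sketch: spill a vector accumulator to a stack buffer when the lane count is run-time only.
#include <opencv2/core/hal/intrin.hpp>
#include <cstdint>

#if (CV_SIMD || CV_SIMD_SCALABLE)
static void spill_sum_sketch(const cv::v_uint32& v_sum, double* dst, int cn)
{
    // max_nlanes is a compile-time upper bound usable as an array size,
    // vlanes() is the actual run-time lane count.
    uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[cv::VTraits<cv::v_uint32>::max_nlanes];
    cv::v_store_aligned(ar, v_sum);
    for (int i = 0; i < cv::VTraits<cv::v_uint32>::vlanes(); ++i)
        dst[i % cn] += ar[i];
}
#endif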

@ -34,7 +34,7 @@
#include <errno.h>
#include <io.h>
#include <stdio.h>
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@ -343,7 +343,7 @@ private:
Impl& operator=(const Impl&); // disabled
};
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
struct FileLock::Impl
{
@ -457,7 +457,7 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu
default_cache_path = "/tmp/";
CV_LOG_WARNING(NULL, "Using a world-accessible cache directory. This may not be secure: " << default_cache_path);
}
#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__
#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
// https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
if (default_cache_path.empty())
{

@ -2292,6 +2292,139 @@ INSTANTIATE_TEST_CASE_P(Arithm, FlipND, testing::Combine(
testing::Values(perf::MatType(CV_8UC1), CV_32FC1)
));
TEST(BroadcastTo, basic) {
std::vector<int> shape_src{2, 1};
std::vector<int> data_src{1, 2};
Mat src(static_cast<int>(shape_src.size()), shape_src.data(), CV_32SC1, data_src.data());
auto get_index = [](const std::vector<int>& shape, size_t cnt) {
std::vector<int> index(shape.size());
size_t t = cnt;
for (int i = static_cast<int>(shape.size() - 1); i >= 0; --i) {
size_t idx = t / shape[i];
index[i] = static_cast<int>(t - idx * shape[i]);
t = idx;
}
return index;
};
auto fn_verify = [&get_index](const Mat& ref, const Mat& res) {
// check type
EXPECT_EQ(ref.type(), res.type());
// check shape
EXPECT_EQ(ref.dims, res.dims);
for (int i = 0; i < ref.dims; ++i) {
EXPECT_EQ(ref.size[i], res.size[i]);
}
// check value
std::vector<int> shape{ref.size.p, ref.size.p + ref.dims};
for (size_t i = 0; i < ref.total(); ++i) {
auto index = get_index(shape, i);
switch (ref.type()) {
case CV_32SC1: {
ASSERT_EQ(ref.at<int>(index.data()), res.at<int>(index.data()));
} break;
case CV_8UC1: {
ASSERT_EQ(ref.at<uint8_t>(index.data()), res.at<uint8_t>(index.data()));
} break;
case CV_32FC1: {
ASSERT_EQ(ref.at<float>(index.data()), res.at<float>(index.data()));
} break;
default: FAIL() << "Unsupported type: " << ref.type();
}
}
};
{
std::vector<int> shape{4, 2, 3};
std::vector<int> data_ref{
1, 1, 1, // [0, 0, :]
2, 2, 2, // [0, 1, :]
1, 1, 1, // [1, 0, :]
2, 2, 2, // [1, 1, :]
1, 1, 1, // [2, 0, :]
2, 2, 2, // [2, 1, :]
1, 1, 1, // [3, 0, :]
2, 2, 2 // [3, 1, :]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), src.type(), data_ref.data());
Mat dst;
broadcast(src, shape, dst);
fn_verify(ref, dst);
}
{
Mat _src;
src.convertTo(_src, CV_8U);
std::vector<int> shape{4, 2, 3};
std::vector<uint8_t> data_ref{
1, 1, 1, // [0, 0, :]
2, 2, 2, // [0, 1, :]
1, 1, 1, // [1, 0, :]
2, 2, 2, // [1, 1, :]
1, 1, 1, // [2, 0, :]
2, 2, 2, // [2, 1, :]
1, 1, 1, // [3, 0, :]
2, 2, 2 // [3, 1, :]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
Mat dst;
broadcast(_src, shape, dst);
fn_verify(ref, dst);
}
{
Mat _src;
src.convertTo(_src, CV_32F);
std::vector<int> shape{1, 1, 2, 1}; // {2, 1}
std::vector<float> data_ref{
1.f, // [0, 0, 0, 0]
2.f, // [0, 0, 1, 0]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
Mat dst;
broadcast(_src, shape, dst);
fn_verify(ref, dst);
}
{
std::vector<int> _shape_src{2, 3, 4};
std::vector<float> _data_src{
1.f, 2.f, 3.f, 4.f, // [0, 0, :]
2.f, 3.f, 4.f, 5.f, // [0, 1, :]
3.f, 4.f, 5.f, 6.f, // [0, 2, :]
4.f, 5.f, 6.f, 7.f, // [1, 0, :]
5.f, 6.f, 7.f, 8.f, // [1, 1, :]
6.f, 7.f, 8.f, 9.f, // [1, 2, :]
};
Mat _src(static_cast<int>(_shape_src.size()), _shape_src.data(), CV_32FC1, _data_src.data());
std::vector<int> shape{2, 1, 2, 3, 4};
std::vector<float> data_ref{
1.f, 2.f, 3.f, 4.f, // [0, 0, 0, 0, :]
2.f, 3.f, 4.f, 5.f, // [0, 0, 0, 1, :]
3.f, 4.f, 5.f, 6.f, // [0, 0, 0, 2, :]
4.f, 5.f, 6.f, 7.f, // [0, 0, 1, 0, :]
5.f, 6.f, 7.f, 8.f, // [0, 0, 1, 1, :]
6.f, 7.f, 8.f, 9.f, // [0, 0, 1, 2, :]
1.f, 2.f, 3.f, 4.f, // [1, 0, 0, 0, :]
2.f, 3.f, 4.f, 5.f, // [1, 0, 0, 1, :]
3.f, 4.f, 5.f, 6.f, // [1, 0, 0, 2, :]
4.f, 5.f, 6.f, 7.f, // [1, 0, 1, 0, :]
5.f, 6.f, 7.f, 8.f, // [1, 0, 1, 1, :]
6.f, 7.f, 8.f, 9.f, // [1, 0, 1, 2, :]
};
Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
Mat dst;
broadcast(_src, shape, dst);
fn_verify(ref, dst);
}
}
TEST(Core_minMaxIdx, regression_9207_2)
{
const int rows = 13;

@ -259,7 +259,7 @@ TEST_P (CountNonZeroND, ndim)
const int ONE_SIZE = 5;
vector<int> sizes(dims);
fill(sizes.begin(), sizes.end(), ONE_SIZE);
std::fill(sizes.begin(), sizes.end(), ONE_SIZE);
Mat data(sizes, CV_MAKETYPE(type, 1));
data = 0;

@ -1475,12 +1475,15 @@ template<typename R> struct TheTest
TheTest & test_float_math()
{
typedef typename V_RegTraits<R>::round_reg Ri;
Data<R> data1, data2, data3;
Data<R> data1, data1_border, data2, data3;
// See https://github.com/opencv/opencv/issues/24213
data1_border *= 0.5;
data1 *= 1.1;
data2 += 10;
R a1 = data1, a2 = data2, a3 = data3;
R a1 = data1, a1_border = data1_border, a2 = data2, a3 = data3;
Data<Ri> resB = v_round(a1),
resB_border = v_round(a1_border),
resC = v_trunc(a1),
resD = v_floor(a1),
resE = v_ceil(a1);
@ -1493,6 +1496,7 @@ template<typename R> struct TheTest
{
SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ(cvRound(data1[i]), resB[i]);
EXPECT_EQ(cvRound(data1_border[i]), resB_border[i]);
EXPECT_EQ((typename VTraits<Ri>::lane_type)data1[i], resC[i]);
EXPECT_EQ(cvFloor(data1[i]), resD[i]);
EXPECT_EQ(cvCeil(data1[i]), resE[i]);
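The extra data1_border input (halved so that values land on exact .5 boundaries) checks that v_round matches cvRound at ties; with the default rounding mode cvRound rounds half to even rather than half away from zero, which is the behaviour the regression in issue 24213 concerns. A small illustration of the reference values, assuming the default FPU rounding mode:

// Sketch: cvRound rounds ties to the even neighbour on the default rounding mode.
#include <opencv2/core.hpp>
#include <cassert>

int main()
{
    assert(cvRound(0.5)  == 0);
    assert(cvRound(1.5)  == 2);
    assert(cvRound(2.5)  == 2);
    assert(cvRound(-0.5) == 0);
    assert(cvRound(-1.5) == -2);
    return 0;
}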

@ -58,11 +58,6 @@ endif()
ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
if(HAVE_TENGINE)
ocv_target_compile_definitions(${the_module} PRIVATE "HAVE_TENGINE=1")
endif()
if(MSVC)
add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 )
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146
@ -172,11 +167,6 @@ else()
set(sources_options ${sources_options} EXCLUDE_CUDA)
endif()
if(HAVE_TENGINE)
list(APPEND include_dirs ${TENGINE_INCLUDE_DIRS})
list(APPEND libs -Wl,--whole-archive ${TENGINE_LIBRARIES} -Wl,--no-whole-archive)
endif()
if(HAVE_TIMVX)
list(APPEND include_dirs ${TIMVX_INCLUDE_DIR})
list(APPEND libs -Wl,--whole-archive ${TIMVX_LIBRARY} -Wl,--no-whole-archive)
@ -237,6 +227,10 @@ if(TARGET ocv.3rdparty.openvino AND OPENCV_DNN_OPENVINO)
endif()
endif()
set(OPENCV_DNN_BACKEND_DEFAULT "" CACHE STRING "Default backend used by the DNN module (DNN_BACKEND_OPENCV if empty)")
if(OPENCV_DNN_BACKEND_DEFAULT)
ocv_append_source_file_compile_definitions("${CMAKE_CURRENT_LIST_DIR}/src/dnn_params.cpp" "OPENCV_DNN_BACKEND_DEFAULT=${OPENCV_DNN_BACKEND_DEFAULT}")
endif()
ocv_install_used_external_targets(${libs} ${dnn_runtime_libs})

@ -69,9 +69,7 @@ CV__DNN_INLINE_NS_BEGIN
*/
enum Backend
{
//! DNN_BACKEND_DEFAULT equals to DNN_BACKEND_INFERENCE_ENGINE if
//! OpenCV is built with Intel OpenVINO or
//! DNN_BACKEND_OPENCV otherwise.
//! DNN_BACKEND_DEFAULT equals OPENCV_DNN_BACKEND_DEFAULT, which can be defined using CMake or a configuration parameter
DNN_BACKEND_DEFAULT = 0,
DNN_BACKEND_HALIDE,
DNN_BACKEND_INFERENCE_ENGINE, //!< Intel OpenVINO computational backend
@ -688,9 +686,6 @@ CV__DNN_INLINE_NS_BEGIN
* @brief Ask network to use specific computation backend where it supported.
* @param[in] backendId backend identifier.
* @see Backend
*
* If OpenCV is compiled with Intel's Inference Engine library, DNN_BACKEND_DEFAULT
* means DNN_BACKEND_INFERENCE_ENGINE. Otherwise it equals to DNN_BACKEND_OPENCV.
*/
CV_WRAP void setPreferableBackend(int backendId);

@ -191,10 +191,10 @@ class dnn_test(NewOpenCVTests):
def test_model(self):
img_path = self.find_dnn_file("dnn/street.png")
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy.prototxt", required=False)
weights = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", required=False)
config = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", required=False)
if weights is None or config is None:
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy_19e3ec3.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
frame = cv.imread(img_path)
model = cv.dnn_DetectionModel(weights, config)

@ -101,8 +101,8 @@ PERF_TEST(SqueezeNet_v1_1_caffe, CaffePerfTest)
PERF_TEST(MobileNet_SSD, CaffePerfTest)
{
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy.prototxt",
"dnn/MobileNetSSD_deploy.caffemodel");
caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
"dnn/MobileNetSSD_deploy_19e3ec3.caffemodel");
TEST_CYCLE() net->Forward();
SANITY_CHECK_NOTHING();
}

@ -678,7 +678,6 @@ PERF_TEST_P_(Layer_FullyConnected, fc)
lp.set("axis", input.dims - 1);
lp.set("is_matmul", weights.dims > 2);
lp.set("bias_term", false);
lp.set("transB", true);
lp.set("num_output", (int)weights.total(0, weights.dims - 1));
lp.blobs.resize(1, weights);

@ -141,7 +141,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", "",
Mat(cv::Size(300, 300), CV_32FC3));
}

@ -36,7 +36,11 @@ bool getParam_DNN_OPENCL_ALLOW_ALL_DEVICES()
int getParam_DNN_BACKEND_DEFAULT()
{
static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
#ifdef OPENCV_DNN_BACKEND_DEFAULT
(size_t)OPENCV_DNN_BACKEND_DEFAULT
#else
(size_t)DNN_BACKEND_OPENCV
#endif
);
return PARAM_DNN_BACKEND_DEFAULT;
}
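Taken together with the CMake hunk above, the default DNN backend is now resolved from three sources. A hedged sketch of the resulting behaviour (the model path is a placeholder):

// Sketch: resolution order for DNN_BACKEND_DEFAULT after this change.
//   1. run time  : the OPENCV_DNN_BACKEND_DEFAULT configuration parameter / environment variable
//   2. build time: -DOPENCV_DNN_BACKEND_DEFAULT=<value> passed to CMake (compiled into dnn_params.cpp)
//   3. fallback  : DNN_BACKEND_OPENCV
#include <opencv2/dnn.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");       // placeholder model path
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);  // resolves as described above
    return 0;
}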

@ -5,6 +5,7 @@
#include "precomp.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/core/utils/logger.hpp>
namespace cv {
@ -100,13 +101,27 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
images_.getMatVector(images);
CV_Assert(!images.empty());
if (param.ddepth == CV_8U)
{
CV_Assert(param.scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
CV_Assert(param.mean == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
}
int nch = images[0].channels();
Scalar scalefactor = param.scalefactor;
Scalar mean = param.mean;
if (param.ddepth == CV_8U)
if (param.swapRB)
{
CV_Assert(scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
CV_Assert(param.mean == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
if (nch > 2)
{
std::swap(mean[0], mean[2]);
std::swap(scalefactor[0], scalefactor[2]);
}
else
{
CV_LOG_WARNING(NULL, "Red/blue color swapping requires at least three image channels.");
}
}
for (size_t i = 0; i < images.size(); i++)
@ -126,34 +141,26 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
size);
images[i] = images[i](crop);
}
else if (param.paddingmode == DNN_PMODE_LETTERBOX)
{
float resizeFactor = std::min(size.width / (float)imgSize.width,
size.height / (float)imgSize.height);
int rh = int(imgSize.height * resizeFactor);
int rw = int(imgSize.width * resizeFactor);
resize(images[i], images[i], Size(rw, rh), INTER_LINEAR);
int top = (size.height - rh)/2;
int bottom = size.height - top - rh;
int left = (size.width - rw)/2;
int right = size.width - left - rw;
copyMakeBorder(images[i], images[i], top, bottom, left, right, BORDER_CONSTANT);
}
else
{
if (param.paddingmode == DNN_PMODE_LETTERBOX)
{
float resizeFactor = std::min(size.width / (float)imgSize.width,
size.height / (float)imgSize.height);
int rh = int(imgSize.height * resizeFactor);
int rw = int(imgSize.width * resizeFactor);
resize(images[i], images[i], Size(rw, rh), INTER_LINEAR);
int top = (size.height - rh)/2;
int bottom = size.height - top - rh;
int left = (size.width - rw)/2;
int right = size.width - left - rw;
copyMakeBorder(images[i], images[i], top, bottom, left, right, BORDER_CONSTANT);
}
else
resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
}
}
Scalar mean = param.mean;
if (param.swapRB)
{
std::swap(mean[0], mean[2]);
std::swap(scalefactor[0], scalefactor[2]);
}
if (images[i].depth() == CV_8U && param.ddepth == CV_32F)
images[i].convertTo(images[i], CV_32F);
@ -220,18 +227,22 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
CV_Assert(image.depth() == blob_.depth());
CV_Assert(image.channels() == image0.channels());
CV_Assert(image.size() == image0.size());
if (param.swapRB)
if (nch > 2 && param.swapRB)
{
Mat tmpRB;
cvtColor(image, tmpRB, COLOR_BGR2RGB);
tmpRB.copyTo(Mat(tmpRB.rows, tmpRB.cols, subMatType, blob.ptr((int)i, 0)));
}
else
{
image.copyTo(Mat(image.rows, image.cols, subMatType, blob.ptr((int)i, 0)));
}
}
}
else
{
CV_Error(Error::StsUnsupportedFormat, "Unsupported data layout in blobFromImagesWithParams function.");
}
}
void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
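DNN_PMODE_LETTERBOX scales the image by the smaller of the width and height ratios and pads the remainder symmetrically with a constant border, preserving the aspect ratio. A worked example of the geometry (sizes are assumed for illustration):

// Sketch: letterbox geometry for a 640x480 image resized into a 300x300 blob.
#include <algorithm>
#include <cstdio>

int main()
{
    int imgW = 640, imgH = 480, dstW = 300, dstH = 300;      // example sizes
    float resizeFactor = std::min(dstW / (float)imgW, dstH / (float)imgH); // 0.46875
    int rw = int(imgW * resizeFactor);                        // 300
    int rh = int(imgH * resizeFactor);                        // 225
    int top = (dstH - rh) / 2, bottom = dstH - top - rh;      // 37, 38
    int left = (dstW - rw) / 2, right = dstW - left - rw;     // 0, 0
    std::printf("resize to %dx%d, pad t=%d b=%d l=%d r=%d\n", rw, rh, top, bottom, left, right);
    return 0;
}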

@ -383,11 +383,17 @@ public:
#endif // OpenVINO >= 2022.1
InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node)
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {}
InfEngineNgraphNode::InfEngineNgraphNode(ngraph::Output<ngraph::Node>&& _node)
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {
CV_Assert(node.get_node());
CV_Assert(node.get_node_shared_ptr());
}
InfEngineNgraphNode::InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node)
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {}
InfEngineNgraphNode::InfEngineNgraphNode(const ngraph::Output<ngraph::Node>& _node)
: BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {
CV_Assert(node.get_node());
CV_Assert(node.get_node_shared_ptr());
}
InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes,
Ptr<Layer>& cvLayer_, std::vector<Mat*>& inputs,
@ -420,7 +426,7 @@ InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& n
}
void InfEngineNgraphNode::setName(const std::string& name) {
node->set_friendly_name(name);
node.get_node()->set_friendly_name(name);
}
InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl)
@ -441,8 +447,7 @@ InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl, InferenceEn
void InfEngineNgraphNet::addOutput(const Ptr<InfEngineNgraphNode>& node)
{
CV_Assert(node);
CV_Assert(node->node);
const std::string& name = node->node->get_friendly_name();
const std::string& name = node->node.get_node()->get_friendly_name();
requestedOutputs.insert({name, node.get()});
}
@ -458,7 +463,7 @@ void InfEngineNgraphNet::createNet(Target targetId) {
CV_Assert(output_node_it->second);
auto out = std::make_shared<ngraph::op::Result>(output_node_it->second->node);
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
out->set_friendly_name(output_node_it->first + (output_node_it->second->node->get_output_size() == 1 ? "" : ".0"));
out->set_friendly_name(output_node_it->first + (output_node_it->second->node.get_node()->get_output_size() == 1 ? "" : ".0"));
#endif
outs.push_back(out);
}

@ -93,13 +93,13 @@ public:
std::vector<Mat*>& inputs, std::vector<Mat>& outputs,
std::vector<Mat>& internals);
InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node);
InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node);
InfEngineNgraphNode(ngraph::Output<ngraph::Node>&& _node);
InfEngineNgraphNode(const ngraph::Output<ngraph::Node>& _node);
void setName(const std::string& name);
// Inference Engine network object that allows to obtain the outputs of this layer.
std::shared_ptr<ngraph::Node> node;
ngraph::Output<ngraph::Node> node;
Ptr<InfEngineNgraphNet> net;
Ptr<dnn::Layer> cvLayer;
};

@ -457,7 +457,7 @@ public:
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
shape[1] = weights_.total();
auto weight = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), weights_.data);
auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), bias_.data);

@ -148,7 +148,7 @@ public:
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
ngraph::OutputVector inp{ieInpNode};
auto blank = std::make_shared<ngraph::op::Concat>(inp, 0);
return Ptr<BackendNode>(new InfEngineNgraphNode(blank));

@ -392,7 +392,7 @@ public:
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node->get_shape().size();
const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node.get_shape().size();
const int cAxis = normalize_axis(axis, numDims);
std::vector<size_t> maxDims(numDims, 0);
@ -403,7 +403,7 @@ public:
auto inp = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
inp_nodes.push_back(inp);
std::vector<size_t> inpShape = inp->get_shape();
std::vector<size_t> inpShape = inp.get_shape();
for (int i = 0; i < numDims; ++i)
maxDims[i] = std::max(maxDims[i], inpShape[i]);
}

@ -62,9 +62,6 @@
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif
#ifdef HAVE_TENGINE
#include "../tengine4dnn/include/tengine_graph_convolution.hpp"
#endif
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/convolution.hpp"
@ -267,10 +264,6 @@ public:
float power;
#endif
#ifdef HAVE_TENGINE
teng_graph_t tengine_graph;
#endif
#ifdef HAVE_CUDA
cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
@ -289,20 +282,8 @@ public:
#ifdef HAVE_CUDA
cudaFusionMode = cuda4dnn::ConvolutionConfiguration::FusionMode::NONE;
cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
#endif
#ifdef HAVE_TENGINE
tengine_graph=NULL;
#endif
}
#ifdef HAVE_TENGINE
~ConvolutionLayerImpl()
{
if(NULL != tengine_graph )
{
tengine_release(tengine_graph);
}
}
#endif
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
{
@ -466,13 +447,6 @@ public:
for(int i = 0; i < numOutput; i++ )
biasvec[i] = biasMat.at<float>(i);
}
#ifdef HAVE_TENGINE
if(NULL != tengine_graph )
{
tengine_release(tengine_graph);
tengine_graph = NULL ;
}
#endif
#ifdef HAVE_OPENCL
convolutionOp.release();
#endif
@ -848,13 +822,13 @@ public:
CV_Assert(!blobs.empty());
CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> dims = ieInpNode->get_shape();
std::vector<size_t> dims = ieInpNode.get_shape();
CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
std::shared_ptr<ngraph::Node> ieWeights = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
ngraph::Output<ngraph::Node> ieWeights;
if (nodes.size() > 1)
CV_Assert(ieWeights); // dynamic_cast should not fail
ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
const int inpCn = dims[1];
const int inpGroupCn = nodes.size() > 1 ? ieWeights->get_shape()[1] : blobs[0].size[1];
const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
const int group = inpCn / inpGroupCn;
std::vector<size_t> kernel_shape;
@ -1095,7 +1069,7 @@ public:
config.pads = pads;
config.stride = stride;
config.dilation = dilation;
if (inputs[0].dims != 4 && inputs[0].dims != umat_blobs[0].dims)
if (inputs[0].dims != 4 && inputs[0].dims != (blobs.empty() ? umat_blobs[0].dims : blobs[0].dims))
{
static bool bypassCheck = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK", false);
if (!bypassCheck)
@ -1107,7 +1081,7 @@ public:
return false;
}
}
config.group = inputs[0].size[1] / umat_blobs[0].size[1];
config.group = inputs[0].size[1] / (blobs.empty() ? umat_blobs[0].size[1] : blobs[0].size[1]);
if (config.group < 1) // config.group == 0 causes div by zero in ocl4dnn code
{
CV_LOG_WARNING(NULL, "DNN/OpenCL: Unsupported config.group=" << config.group
@ -1305,65 +1279,6 @@ public:
}
}
#ifdef HAVE_TENGINE
bool tengine_ret = false;
std::vector<Mat> teng_in, teng_out;
inputs_arr.getMatVector(teng_in);
outputs_arr.getMatVector(teng_out);
int inch = teng_in[0].size[1]; // inch
int in_h = teng_in[0].size[2]; // in_h
int in_w = teng_in[0].size[3]; // in_w
int out_b = teng_out[0].size[0]; // out batch size
int outch = teng_out[0].size[1]; // outch
int out_h = teng_out[0].size[2]; // out_h
int out_w = teng_out[0].size[3]; // out_w
float *input_ = teng_in[0].ptr<float>();
float *output_ = teng_out[0].ptr<float>();
float *kernel_ = weightsMat.ptr<float>();
float *teg_bias = &biasvec[0];
int nstripes = std::max(getNumThreads(), 1);
/* tengine_init will run when first time. */
if(NULL == tengine_graph)
{
// pads_begin: 0 - pad_top, 1 - pad_left
// pads_end: 0 - pad_bottom, 1 - pad_right
// pad_h0: pad_top, pad_h1: pad_bottom
// pad_w0: pad_left, pad_w1: pad_right
tengine_graph = tengine_init(name.c_str(), input_, inch, ngroups, in_h, in_w,
output_, out_b, outch, out_h, out_w,
kernel_, kernel_size.size(), kernel.height, kernel.width,
teg_bias, stride.height, stride.width,
pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
weightsMat.step1(), padMode, tengine_graph, nstripes);
// printf("Init(%s): input=%p(%d %d %d %d ),output=%p(%d %d %d %d ),kernel=%p(%ld %d %d ), bias=%p ,"
// "stride(%d %d), pad(%d %d %d %d), dilation(%d %d) ,weightsMat=%ld, padMode=%s ,tengine_graph = %p \n",
// name.c_str(),input_, inch, ngroups, in_h, in_w,
// output_, out_b, outch, out_h, out_w,
// kernel_, kernel_size.size(), kernel.height, kernel.width,
// teg_bias, stride.height, stride.width,
// pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
// weightsMat.step1(), padMode.c_str() ,tengine_graph);
}
if(NULL != tengine_graph)
{
tengine_ret = tengine_forward(tengine_graph);
}
/* activation */
if((true == tengine_ret) && activ )
{
int out_cstep = out_h * out_w; // out_cstep
ActivationLayer* activ_ = activ.get();
activ_->forwardSlice(output_, output_, out_cstep, out_cstep, 0, outch);
}
if(false == tengine_ret)
#endif
{
int nstripes = std::max(getNumThreads(), 1);
int conv_dim = CONV_2D;

@ -14,7 +14,7 @@
#define CONV_NR_FP32 28
// The FP16 can only be supported by ARM64 and with FP16 FMA supported.
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA.
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && CV_FP16 // check FP16 FMA.
#define CONV_ARM_FP16 1
#endif

@ -133,7 +133,7 @@ public:
auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto rois = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
auto rois_shape = rois->get_shape();
auto rois_shape = rois.get_shape();
std::vector<int64_t> dims(rois_shape.begin(), rois_shape.end()), offsets(4, 0);
offsets[3] = 2;
dims[3] = 7;

@ -490,7 +490,7 @@ struct ReLUFunctor : public BaseFunctor
#endif
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
if (slope) {
auto param = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &slope);
@ -674,7 +674,7 @@ struct ReLU6Functor : public BaseFunctor
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
return std::make_shared<ngraph::op::Clamp>(node, minValue, maxValue);
}
@ -796,7 +796,7 @@ struct BaseDefaultFunctor : public BaseFunctor
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
CV_Error(Error::StsNotImplemented, "");
}
@ -929,7 +929,7 @@ struct TanHFunctor : public BaseDefaultFunctor<TanHFunctor>
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
return std::make_shared<ngraph::op::Tanh>(node);
}
@ -998,7 +998,7 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
auto sigmoid = std::make_shared<ngraph::op::Sigmoid>(node);
return std::make_shared<ngraph::op::v1::Multiply>(node, sigmoid);
@ -1074,7 +1074,7 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
float one = 1.0f;
auto constant = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &one);
@ -1157,7 +1157,7 @@ struct SigmoidFunctor : public BaseDefaultFunctor<SigmoidFunctor>
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
return std::make_shared<ngraph::op::Sigmoid>(node);
}
@ -1237,7 +1237,7 @@ struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
return std::make_shared<ngraph::op::Elu>(node, alpha);
}
@ -1307,7 +1307,7 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
float coeff = -0.999999f;
// float coeff = preferableTarget == DNN_TARGET_MYRIAD ? -0.999f : -0.999999f;
@ -1603,7 +1603,7 @@ struct SqrtFunctor : public BaseDefaultFunctor<SqrtFunctor>
#endif // HAVE_HALIDE
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
return std::make_shared<ngraph::op::v0::Sqrt>(node);
}
@ -2329,7 +2329,7 @@ struct PowerFunctor : public BaseFunctor
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::Shape{1}, &scale);
@ -2439,7 +2439,7 @@ struct ExpFunctor : public BaseDefaultFunctor<ExpFunctor>
#endif // HAVE_HALIDE
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
ngraph::Shape{1}, &normScale);
@ -2598,7 +2598,7 @@ struct ChannelsPReLUFunctor : public BaseFunctor
#endif // HAVE_CANN
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
const size_t numChannels = scale.total();
auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numChannels}, scale.data);
@ -2678,7 +2678,7 @@ struct PReLUFunctor : public ChannelsPReLUFunctor
}
#ifdef HAVE_DNN_NGRAPH
std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
{
auto shape = getShape<size_t>(scale);
auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, scale.ptr<float>());

@ -896,12 +896,14 @@ public:
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
CV_Assert(nodes.size() >= 2);
auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
if (!coeffs.empty()) {
auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
}
std::shared_ptr<ngraph::Node> res;
for (size_t i = 1; i < nodes.size(); i++)
{
auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
@ -910,15 +912,16 @@ public:
next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
}
switch (op) {
case SUM: curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
case DIV: curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
case MAX: curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
case MIN: curr_node = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
case SUM: res = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
case PROD: res = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
case DIV: res = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
case MAX: res = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
case MIN: res = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
}
curr_node = res;
}
return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
return Ptr<BackendNode>(new InfEngineNgraphNode(res));
}
#endif // HAVE_DNN_NGRAPH
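The change repeated across the hunks above — taking ngraph::Output<ngraph::Node> instead of std::shared_ptr<ngraph::Node> and accumulating the chain through a separate result node — can be shown in isolation. A minimal sketch, assuming the standard ngraph headers and the opset1 Add operation; this is not the exact layer code:
#include <ngraph/ngraph.hpp>
#include <vector>
// Chain a SUM-style eltwise over several inputs, mirroring the loop above:
// each step creates a fresh Add node, and the running operand is that node's output.
static ngraph::Output<ngraph::Node>
chainSum(const std::vector<ngraph::Output<ngraph::Node>>& inputs)
{
    ngraph::Output<ngraph::Node> curr = inputs.at(0);
    std::shared_ptr<ngraph::Node> res;
    for (size_t i = 1; i < inputs.size(); ++i)
    {
        res = std::make_shared<ngraph::op::v1::Add>(curr, inputs[i]);
        curr = res;  // Output<Node> is constructible from shared_ptr<Node>
    }
    return curr;
}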

@ -209,7 +209,7 @@ public:
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
std::vector<size_t> dims = ieInpNode->get_shape();
std::vector<size_t> dims = ieInpNode.get_shape();
int numAxes = dims.size();
int startAxis = normalize_axis(_startAxis, numAxes);

@ -803,7 +803,7 @@ public:
}
else
{
std::vector<int> shape(1 + normalize_axis(axis, ieInpNode->get_shape().size()), 0);
std::vector<int> shape(1 + normalize_axis(axis, ieInpNode.get_shape().size()), 0);
shape[shape.size() - 1] = -1;
auto inp = std::make_shared<ngraph::op::v1::Reshape>(
ieInpNode,

@ -480,7 +480,7 @@ public:
if (type != SPATIAL_NRM) {
axes = {1};
} else {
axes.resize(ieInpNode->get_shape().size() - 2);
axes.resize(ieInpNode.get_shape().size() - 2);
std::iota(axes.begin(), axes.end(), 2);
}
auto ngraph_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes.data());

@ -194,7 +194,7 @@ public:
std::vector<MatShape> inpShapes(nodes.size());
std::vector<MatShape> outShapes, internals;
for (int i = 0; i < nodes.size(); ++i) {
std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
inpShapes[i] = std::vector<int>(shape.begin(), shape.end());
}
getMemoryShapes(inpShapes, 1, outShapes, internals);
@ -213,7 +213,7 @@ public:
std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{1}, &newShape),
true
);
if (indices->get_element_type() != ngraph::element::i32 && indices->get_element_type() != ngraph::element::i64) {
if (indices.get_element_type() != ngraph::element::i32 && indices.get_element_type() != ngraph::element::i64) {
indices = std::make_shared<ngraph::op::Convert>(indices, ngraph::element::i64);
}

@ -390,7 +390,7 @@ public:
auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
#else
int64_t start_axis = acrossChannels ? 1 : 2;
std::vector<int64_t> axes_v(ieInpNode->get_shape().size() - start_axis);
std::vector<int64_t> axes_v(ieInpNode.get_shape().size() - start_axis);
std::iota(axes_v.begin(), axes_v.end(), start_axis);
auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data());
auto mvn = std::make_shared<ngraph::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT);

@ -900,12 +900,12 @@ public:
auto& inp0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto& inp1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
if (inp0->get_element_type() != inp1->get_element_type()) {
if (inp0.get_element_type() != inp1.get_element_type()) {
auto dtype = preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD ?
ngraph::element::f16 : ngraph::element::f32;
if (inp0->get_element_type() != dtype)
if (inp0.get_element_type() != dtype)
inp0 = std::make_shared<ngraph::op::v0::Convert>(inp0, dtype);
if (inp1->get_element_type() != dtype)
if (inp1.get_element_type() != dtype)
inp1 = std::make_shared<ngraph::op::v0::Convert>(inp1, dtype);
}

@ -273,21 +273,21 @@ public:
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
const size_t batch = ieInpNode->get_shape()[0];
const size_t numChannels = ieInpNode->get_shape()[1];
const size_t batch = ieInpNode.get_shape()[0];
const size_t numChannels = ieInpNode.get_shape()[1];
std::vector<int64_t> axes_data;
if (!acrossSpatial) {
axes_data.push_back(1);
} else {
axes_data.resize(ieInpNode->get_shape().size() - 1);
axes_data.resize(ieInpNode.get_shape().size() - 1);
std::iota(axes_data.begin(), axes_data.end(), 1);
}
auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data);
auto norm = std::make_shared<ngraph::op::v0::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);
CV_Assert(blobs.empty() || numChannels == blobs[0].total());
std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
shape[0] = blobs.empty() ? 1 : batch;
shape[1] = numChannels;
if (!blobs.empty())

@ -209,7 +209,8 @@ public:
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
{
return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin());
return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin()) &&
(!computeMaxIdx || INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1));
}
#endif
if (backendId == DNN_BACKEND_OPENCV)
@ -600,7 +601,7 @@ public:
return Ptr<BackendNode>(new InfEngineNgraphNode(ave_pool));
}
else if (type == SUM) {
ngraph::Shape inpShape = ieInpNode->get_shape();
ngraph::Shape inpShape = ieInpNode.get_shape();
CV_Assert(inpShape.size() == 2 + kernel_size.size());
std::vector<int64_t> axes;
for (size_t i = 0; i < kernel_size.size(); i++)
@ -615,10 +616,14 @@ public:
else if (type == MAX) {
std::shared_ptr<ngraph::Node> max_pool;
if (computeMaxIdx) {
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
std::vector<size_t> dilations(kernel_size.size(), 1);
max_pool = std::make_shared<ngraph::op::v8::MaxPool>(ieInpNode, ngraph::Strides(strides), ngraph::Strides(dilations),
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
rounding_type, pad_type);
#else
CV_Error(Error::StsNotImplemented, "OpenVINO MaxPool with indices");
#endif
} else {
max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),

@ -366,10 +366,10 @@ public:
auto& class_logits = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
auto& image_shape = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
CV_Assert_N(image_shape->get_shape().size() == 2, image_shape->get_shape().front() == 1);
CV_Assert_N(image_shape.get_shape().size() == 2, image_shape.get_shape().front() == 1);
auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
ngraph::Shape{1},
std::vector<int64_t>{(int64_t)image_shape->get_shape().back()});
std::vector<int64_t>{(int64_t)image_shape.get_shape().back()});
auto reshape = std::make_shared<ngraph::op::v1::Reshape>(image_shape, shape, true);
auto proposal = std::make_shared<ngraph::op::Proposal>(class_probs, class_logits, reshape, attr);

@ -466,7 +466,7 @@ public:
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto parent_shape = input->get_shape();
auto parent_shape = input.get_shape();
int64_t b = parent_shape[0];
int64_t h = parent_shape[1];
int64_t w = parent_shape[2];
@ -567,7 +567,7 @@ public:
int hNorm, wNorm;
if (nodes.size() > 1)
{
auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
hNorm = node_1_shape[2];
wNorm = node_1_shape[3];
}

@ -443,7 +443,7 @@ public:
std::vector<int64_t> shape = {outHeight, outWidth};
auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
auto& input_shape = ieInpNode->get_shape();
auto& input_shape = ieInpNode.get_shape();
CV_Assert_N(input_shape[2] != 0, input_shape[3] != 0);
std::vector<float> scales = {static_cast<float>(outHeight) / input_shape[2], static_cast<float>(outWidth) / input_shape[3]};
auto scales_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{2}, scales.data());

@ -331,34 +331,36 @@ public:
virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto ieInpNode0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
auto ieInpNode1 = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
ngraph::Output<ngraph::Node> ieInpNode1;
if (nodes.size() > 1)
ieInpNode1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
size_t numChannels = 1;
if (blobs.empty())
for (const size_t& dim : ieInpNode1->get_shape())
for (const size_t& dim : ieInpNode1.get_shape())
numChannels *= dim;
else
numChannels = blobs[0].total();
std::vector<size_t> shape(ieInpNode0->get_shape().size(), 1);
std::vector<size_t> shape(ieInpNode0.get_shape().size(), 1);
int cAxis = normalize_axis(axis, shape.size());
shape[cAxis] = numChannels;
auto node = ieInpNode0;
std::shared_ptr<ngraph::Node> node;
if (hasWeights)
{
auto weight = blobs.empty() ? ieInpNode1 :
ngraph::Output<ngraph::Node> weight = blobs.empty() ? ieInpNode1 :
std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
node = std::make_shared<ngraph::op::v1::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
node = std::make_shared<ngraph::op::v1::Multiply>(ieInpNode0, weight, ngraph::op::AutoBroadcastType::NUMPY);
#else
node = std::make_shared<ngraph::op::v0::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
node = std::make_shared<ngraph::op::v0::Multiply>(ieInpNode0, weight, ngraph::op::AutoBroadcastType::NUMPY);
#endif
}
if (hasBias || !hasWeights)
{
std::shared_ptr<ngraph::Node> bias;
ngraph::Output<ngraph::Node> bias;
if (hasBias)
{
bias = blobs.empty() ? ieInpNode1 :

@ -759,7 +759,7 @@ public:
{
CV_Assert_N(nodes.size() <= 2);
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
CV_Assert(finalSliceRanges[0].size() == ieInpNode->get_shape().size());
CV_Assert(finalSliceRanges[0].size() == ieInpNode.get_shape().size());
std::vector<int64_t> offsets, dims;
for (int i = 0; i < finalSliceRanges[0].size(); ++i)

@ -385,7 +385,7 @@ public:
const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{
auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
int axis = normalize_axis(axisRaw, ieInpNode->get_shape().size());
int axis = normalize_axis(axisRaw, ieInpNode.get_shape().size());
auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
if (logSoftMax)
return Ptr<BackendNode>(new InfEngineNgraphNode(std::make_shared<ngraph::op::v0::Log>(softmax)));

@ -210,7 +210,7 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) != "add")
{
CV_LOG_DEBUG(NULL, "DNN/CPU: fusion with NaryEltwise or Eltwise Layer operation is not supported: "
<< nextData->params.get<String>("operation"));
<< toLowerCase(nextData->params.get<String>("operation", "sum")));
break;
}

@ -252,7 +252,7 @@ void NetImplOpenVINO::addNgraphOutputs(LayerData& ld)
CV_Assert(!ieInpNode->net.empty());
if (layerNet != ieInpNode->net)
{
CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node->get_friendly_name());
CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node.get_node()->get_friendly_name());
ieInpNode->net->addOutput(ieInpNode);
}
}
@ -321,8 +321,10 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
return;
}
#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2022_1)
bool supportsCPUFallback = !isArmComputePlugin() && (preferableTarget == DNN_TARGET_CPU ||
openvino::checkTarget(DNN_TARGET_CPU));
#endif
// Build Inference Engine networks from sets of layers that support this
// backend. Split a whole model into several Inference Engine networks if
@ -341,6 +343,10 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
bool fused = ld.skip;
Ptr<Layer> layer = ld.layerInstance;
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
if (ld.id == 0)
continue;
#else
if (!fused && !layer->supportBackend(preferableBackend))
{
CV_LOG_DEBUG(NULL, "DNN/IE: NOT supported!");
@ -355,17 +361,6 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
}
}
// TODO: fix these workarounds
if (preferableTarget == DNN_TARGET_MYRIAD ||
preferableTarget == DNN_TARGET_HDDL ||
preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16)
customizable &= ld.type != "Concat";
if (preferableTarget == DNN_TARGET_OPENCL ||
preferableTarget == DNN_TARGET_OPENCL_FP16)
customizable &= ld.type != "Power";
if (preferableTarget == DNN_TARGET_OPENCL)
customizable &= ld.type != "Eltwise";
@ -390,6 +385,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
continue;
}
}
#endif
ld.skip = true; // Initially skip all Inference Engine supported layers.
// Create a new network if one of inputs from different Inference Engine graph.
@ -478,7 +474,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
int oid = ld.inputBlobsId[i].oid;
auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
const auto& ngraph_input_node = ieInpNode->node;
const auto& ngraph_input_node = ieInpNode->node.get_node_shared_ptr();
CV_LOG_DEBUG(NULL, "DNN/IE: bind output port " << lid << ":" << oid << " (" << ngraph_input_node->get_friendly_name() << ":" << ngraph_input_node->get_type_info().name << ")");
if ((oid == 0 && ngraph_input_node->get_output_size() == 1) || lid == 0)
@ -498,10 +494,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
}
CV_CheckLT((size_t)oid, ngraph_input_node->get_output_size(), "");
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
// FIXIT refactor ".initNgraph()" API to use Output<Node>
// WA: use Concat to emulate Identity operation with requested output port
auto oid_node = std::make_shared<ngraph::op::Concat>(ngraph::OutputVector { ngraph_input_node->output(oid) }, 0);
inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(oid_node));
inputNodes[i] = new InfEngineNgraphNode(ngraph_input_node->output(oid));
#elif INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_3)
inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid)));
#else
@ -556,6 +549,36 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
addNgraphOutputs(ld);
}
// The user may choose to return only intermediate blobs, not the network's result (see Test_TFLite.max_unpooling).
// Such layers should not be skipped when forwardLayer is called.
// Also, perform a sanity check that no network is inferred twice (a single skip=false per unique net instance).
std::set<Ptr<InfEngineNgraphNet>> uniqueNets;
if (!blobsToKeep_.empty())
{
LayerPin latestLayerPin = getLatestLayerPin(blobsToKeep_);
for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
{
LayerData& ld = it->second;
auto iter = ld.backendNodes.find(preferableBackend);
if (iter == ld.backendNodes.end())
continue;
Ptr<BackendNode>& node = iter->second;
if (node.empty())
continue;
Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
if (ieNode.empty())
continue;
if (ld.id == latestLayerPin.lid) {
ld.skip = false;
uniqueNets.insert(ieNode->net);
break;
}
}
}
// Initialize all networks.
for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
{
@ -578,9 +601,15 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
{
ieNode->net->addOutput(ieNode);
ieNode->net->createNet((Target)preferableTarget);
ld.skip = false;
if (uniqueNets.find(ieNode->net) == uniqueNets.end()) {
ld.skip = false;
uniqueNets.insert(ieNode->net);
}
}
}
#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
CV_Assert(uniqueNets.size() == 1);
#endif
}

@ -14,6 +14,7 @@
#include "halide_scheduler.hpp"
#include <HalideRuntimeOpenCL.h>
#include <thread>
#endif // HAVE_HALIDE
namespace cv {

@ -453,14 +453,14 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
int w;
for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
barrier(CLK_LOCAL_MEM_FENCE);
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, (__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index)));
vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index)));
vstore8(vload8(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
vstore8(vload8(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
barrier(CLK_LOCAL_MEM_FENCE);
slm_brow0 = slm_brow + local_x * (TILE_K / 8);
@ -469,17 +469,17 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
while( w + TILE_K <= end_w ) {
Dtype8 arow;
brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK)));
brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK)));
brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK)));
brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK)));
brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK)));
brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK)));
brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK)));
brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK)));
brow0 = vload8(0, slm_brow0 + 0 * SLM_BLOCK);
brow1 = vload8(0, slm_brow0 + 1 * SLM_BLOCK);
brow2 = vload8(0, slm_brow0 + 2 * SLM_BLOCK);
brow3 = vload8(0, slm_brow0 + 3 * SLM_BLOCK);
brow4 = vload8(0, slm_brow0 + 4 * SLM_BLOCK);
brow5 = vload8(0, slm_brow0 + 5 * SLM_BLOCK);
brow6 = vload8(0, slm_brow0 + 6 * SLM_BLOCK);
brow7 = vload8(0, slm_brow0 + 7 * SLM_BLOCK);
#define MM_DOT_PRODUCT( _row, _dot ) \
arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \
arow = vload8(0, src0_read + _row * K); \
_dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \
_dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \
_dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \
@ -510,7 +510,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
Dtype8 arow;
#define READ_BROW(_brow, _row) \
_brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \
_brow = vload8(0, slm_brow0 + _row * SLM_BLOCK); \
_brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \
_brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \
_brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \
@ -532,7 +532,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
#undef READ_BROW
#define MM_DOT_PRODUCT( _row, _dot ) \
arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K))); \
arow = vload8(0, src0_read + _row * K); \
arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \
arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \
arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \

@ -1,53 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: qtang@openailab.com
*/
#ifndef TENGINE_GRAPH_CONVOLUTION_HPP
#define TENGINE_GRAPH_CONVOLUTION_HPP
#define FLOAT_TO_REALSIZE (4)
#ifdef HAVE_TENGINE
#include "tengine_c_api.h"
namespace cv
{
namespace dnn
{
// pad_h0: pad_top
// pad_h1: pad_bottom
// pad_w0: pad_left
// pad_w1: pad_right
teng_graph_t tengine_init(const char* name , float* input_, int inch, int group, int in_h, int in_w,
float *output_, int out_b, int outch, int out_h, int out_w,
float *kernel_,int kernel_s , int kernel_h, int kernel_w,
float *teg_bias, int stride_h, int stride_w,
int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
size_t wstep, const std::string padMode , teng_graph_t& graph, int nstripes) ;
bool tengine_forward(teng_graph_t& graph) ;
bool tengine_release(teng_graph_t& graph) ;
}
}
#endif
#endif /* TENGINE_GRAPH_CONVOLUTION_HPP */
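Since the declarations above are removed together with the Tengine backend, a short sketch of how they were chained may help readers of this hunk. Everything below is a placeholder-level illustration (it assumes the code lives inside cv::dnn and that HAVE_TENGINE is defined); buffers and sizes must come from the caller:
// Minimal usage sketch of the (removed) Tengine convolution path.
static void runTengineConv(float* input, float* output, float* kernel, float* bias,
                           int inch, int group, int in_h, int in_w,
                           int out_b, int outch, int out_h, int out_w,
                           int kernel_h, int kernel_w, int stride_h, int stride_w,
                           int pad_h0, int pad_h1, int pad_w0, int pad_w1,
                           int dilation_h, int dilation_w, size_t wstep,
                           const std::string& padMode, int nstripes)
{
    teng_graph_t graph = NULL;
    // Builds the Tengine graph; kernel_s == 2 because only plain Conv2D is supported.
    graph = tengine_init("conv", input, inch, group, in_h, in_w,
                         output, out_b, outch, out_h, out_w,
                         kernel, /*kernel_s=*/2, kernel_h, kernel_w,
                         bias, stride_h, stride_w,
                         pad_h0, pad_h1, pad_w0, pad_w1, dilation_h, dilation_w,
                         wstep, padMode, graph, nstripes);
    if (graph != NULL && tengine_forward(graph))
    {
        // `output` now holds the convolution result.
    }
    if (graph != NULL)
        tengine_release(graph);  // postrun + destroy
}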

@ -1,370 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: qtang@openailab.com
*/
#include "../../precomp.hpp"
#include <iostream>
#include <vector>
#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/logger.hpp>
#include "../include/tengine_graph_convolution.hpp"
#ifdef HAVE_TENGINE
#include "tengine_c_api.h"
namespace cv
{
namespace dnn
{
static int create_input_node(teng_graph_t graph, const char* node_name, int inch, int in_h, int in_w)
{
node_t node = teng_create_graph_node(graph, node_name, "InputOp");
tensor_t tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
teng_set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT);
int dims[4] = {1, inch, in_h, in_w};
teng_set_tensor_shape(tensor, dims, 4);
teng_release_graph_tensor(tensor);
teng_release_graph_node(node);
return 0;
}
static int create_conv_node(teng_graph_t graph, const char* node_name, const char* input_name, int in_h, int in_w, int out_h, int out_w,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, int pad_w0, int pad_w1, int inch, int outch, int group,
int dilation_h, int dilation_w, int activation, std::string padMode)
{
node_t conv_node = teng_create_graph_node(graph, node_name, "Convolution");
tensor_t input_tensor = teng_get_graph_tensor(graph, input_name);
if (input_tensor == NULL)
{
CV_LOG_WARNING(NULL,"Tengine: input_tensor is NULL." );
return -1;
}
teng_set_node_input_tensor(conv_node, 0, input_tensor);
teng_release_graph_tensor(input_tensor);
/* output */
tensor_t output_tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
teng_set_node_output_tensor(conv_node, 0, output_tensor, TENSOR_TYPE_VAR);
teng_release_graph_tensor(output_tensor);
/* weight */
std::string weight_name(node_name);
weight_name += "/weight";
node_t w_node = teng_create_graph_node(graph, weight_name.c_str(), "Const");
tensor_t w_tensor = teng_create_graph_tensor(graph, weight_name.c_str(), TENGINE_DT_FP32);
teng_set_node_output_tensor(w_node, 0, w_tensor, TENSOR_TYPE_CONST);
teng_set_node_input_tensor(conv_node, 1, w_tensor);
int w_dims[] = {outch, inch / group, kernel_h, kernel_w};
teng_set_tensor_shape(w_tensor, w_dims, 4);
teng_release_graph_node(w_node);
teng_release_graph_tensor(w_tensor);
/* bias */
std::string bias_name(node_name);
bias_name += "/bias";
node_t b_node = teng_create_graph_node(graph, bias_name.c_str(), "Const");
tensor_t b_tensor = teng_create_graph_tensor(graph, bias_name.c_str(), TENGINE_DT_FP32);
teng_set_node_output_tensor(b_node, 0, b_tensor, TENSOR_TYPE_CONST);
int b_dims[] = {outch};
teng_set_tensor_shape(b_tensor, b_dims, 1);
teng_set_node_input_tensor(conv_node, 2, b_tensor);
teng_release_graph_node(b_node);
teng_release_graph_tensor(b_tensor);
if (!padMode.empty())
{
if (padMode == "SAME")
{
int out_h_temp = (in_h-kernel_h + 2*pad_h0)/stride_h + 1;
int out_w_temp = (in_w-kernel_w + 2*pad_w0)/stride_w + 1;
if (out_h_temp < out_h)
pad_h1 += 1;
if (out_w_temp < out_w)
pad_w1 += 1;
}
}
/* attr */
teng_set_node_attr_int(conv_node, "kernel_h", &kernel_h);
teng_set_node_attr_int(conv_node, "kernel_w", &kernel_w);
teng_set_node_attr_int(conv_node, "stride_h", &stride_h);
teng_set_node_attr_int(conv_node, "stride_w", &stride_w);
teng_set_node_attr_int(conv_node, "pad_h0", &pad_h0);
teng_set_node_attr_int(conv_node, "pad_w0", &pad_w0);
teng_set_node_attr_int(conv_node, "pad_h1", &pad_h1);
teng_set_node_attr_int(conv_node, "pad_w1", &pad_w1);
teng_set_node_attr_int(conv_node, "output_channel", &outch);
teng_set_node_attr_int(conv_node, "input_channel", &inch);
teng_set_node_attr_int(conv_node, "group", &group);
teng_set_node_attr_int(conv_node, "dilation_h", &dilation_h);
teng_set_node_attr_int(conv_node, "dilation_w", &dilation_w);
// set_node_attr_int(conv_node, "activation", &activation);
teng_release_graph_node(conv_node);
return 0;
}
static teng_graph_t create_conv_graph(const char* layer_name, float* input_data, int inch, int group, int in_h, int in_w,
float* output_data, int outch, int out_h, int out_w,
int kernel_h, int kernel_w,
int stride_h,int stride_w,
int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w, int activation,
float* teg_weight, float* teg_bias, std::string padMode, int nstripes)
{
node_t conv_node = NULL;
tensor_t input_tensor = NULL;
tensor_t output_tensor = NULL;
tensor_t weight_tensor = NULL;
tensor_t bias_tensor = NULL;
/* create graph for convolution */
int in_size = in_h * in_w * inch;
int out_size = out_h * out_w * outch;
int weight_size = outch * (inch / group) * kernel_w * kernel_h;
int bias_size = outch;
int buf_size = 0;
int input_num = 0;
/* create graph */
teng_graph_t graph = teng_create_graph(NULL, NULL, NULL);
bool ok = true;
if(graph == NULL)
{
CV_LOG_WARNING(NULL,"Tengine: create_graph failed." );
ok = false;
}
const char* input_name = "data";
const char* conv_name = layer_name;
if (ok && create_input_node(graph, input_name, inch, in_h, in_w) < 0)
{
CV_LOG_WARNING(NULL,"Tengine: create_input_node failed." );
ok = false;
}
if (ok && create_conv_node(graph, conv_name, input_name, in_h, in_w, out_h, out_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h0, pad_h1, pad_w0, pad_w1, inch, outch, group, dilation_h, dilation_w, activation, padMode) < 0)
{
CV_LOG_WARNING(NULL,"Tengine: create conv node failed." );
ok = false;
}
/* set input/output node */
const char* inputs_name[] = {input_name};
const char* outputs_name[] = {conv_name};
if (ok && teng_set_graph_input_node(graph, inputs_name, sizeof(inputs_name) / sizeof(char*)) < 0)
{
CV_LOG_WARNING(NULL,"Tengine: set inputs failed." );
ok = false;
}
if (ok && teng_set_graph_output_node(graph, outputs_name, sizeof(outputs_name) / sizeof(char*)) < 0)
{
CV_LOG_WARNING(NULL,"Tengine: set outputs failed." );
ok = false;
}
/* set input data */
if (ok)
{
input_tensor = teng_get_graph_input_tensor(graph, 0, 0);
buf_size = teng_get_tensor_buffer_size(input_tensor);
if (buf_size != in_size * FLOAT_TO_REALSIZE)
{
CV_LOG_WARNING(NULL,"Tengine: Input data size check failed.");
ok = false;
}
}
if (ok)
{
teng_set_tensor_buffer(input_tensor, (float *)input_data, buf_size);
teng_release_graph_tensor(input_tensor);
/* create convolution node */
/* set weight node */
conv_node = teng_get_graph_node(graph, conv_name);
weight_tensor = teng_get_node_input_tensor(conv_node, 1);
buf_size = teng_get_tensor_buffer_size(weight_tensor);
if (buf_size != weight_size * FLOAT_TO_REALSIZE)
{
CV_LOG_WARNING(NULL,"Tengine: Input weight size check failed.");
ok = false;
}
}
if (ok)
{
teng_set_tensor_buffer(weight_tensor, teg_weight, buf_size);
/* set bias node */
input_num = teng_get_node_input_number(conv_node);
if (input_num > 2)
{
bias_tensor = teng_get_node_input_tensor(conv_node, 2);
buf_size = teng_get_tensor_buffer_size(bias_tensor);
if (buf_size != bias_size * FLOAT_TO_REALSIZE)
{
CV_LOG_WARNING(NULL,"Tengine: Input bias size check failed.");
ok = false;
}
else teng_set_tensor_buffer(bias_tensor, teg_bias, buf_size);
}
}
/* prerun */
if (ok && teng_prerun_graph_multithread(graph, TENGINE_CLUSTER_BIG, nstripes) < 0)
{
CV_LOG_WARNING(NULL, "Tengine: prerun_graph failed.");
ok = false;
}
if (ok)
{
/* set output data */
output_tensor = teng_get_node_output_tensor(conv_node, 0);
int ret = teng_set_tensor_buffer(output_tensor, output_data, out_size * FLOAT_TO_REALSIZE);
if(ret)
{
CV_LOG_WARNING(NULL,"Tengine: Set output tensor buffer failed." );
ok = false;
}
}
if (false == ok)
{
teng_destroy_graph(graph) ;
return NULL ;
}
return graph;
}
static bool tengine_init_flag = false;
teng_graph_t tengine_init(const char* layer_name, float* input_, int inch, int group, int in_h, int in_w,
float *output_, int out_b, int outch, int out_h, int out_w,
float *kernel_, int kernel_s ,int kernel_h, int kernel_w,
float *teg_bias, int stride_h, int stride_w,
int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
size_t wstep, const std::string padMode, teng_graph_t &graph, int nstripes)
{
std::vector<float> teg_weight_vec;
float *teg_weight = NULL;
int kernel_inwh = (inch / group) * kernel_w * kernel_h;
// Do not use the activation fuse mode; run convolution only.
int activation = -1;
if (!(kernel_s == 2 && kernel_h == kernel_w
&& dilation_h == dilation_w && stride_h == stride_w
&& out_b == 1 && pad_h0 < 10 && pad_h1 < 10 && pad_w0 < 10 && pad_w1 < 10)) // just for Conv2D
{
// printf("return : just for Conv2D\n");
return NULL;
}
{
/* printf("Tengine(%s): input (1 x %d x %d x %d),output (%d x %d x %d x %d), kernel (%d x %d), stride (%d x %d), dilation (%d x %d), pad (%d x %d).\n",
layer_name, inch, in_h, in_w,
out_b, outch, out_h, out_w,
kernel_w, kernel_h,
stride_w, stride_h,
dilation_w, dilation_h,
pad_h0, pad_h1, pad_w0, pad_w1);
*/
// weight
if (kernel_inwh != wstep)
{
teg_weight_vec.resize(kernel_inwh * outch);
teg_weight = &teg_weight_vec[0];
for (int i=0; i<outch; i++)
{
memcpy(teg_weight+i*kernel_inwh, kernel_+i*wstep, kernel_inwh*FLOAT_TO_REALSIZE);
}
}
else
{
teg_weight = kernel_;
}
/* initial the resource of tengine */
if(false == tengine_init_flag)
{
init_tengine();
tengine_init_flag = true;
}
/* create the convolution graph */
graph = create_conv_graph(layer_name, input_, inch, group, in_h, in_w,
output_, outch, out_h, out_w,
kernel_h, kernel_w, stride_h,stride_w,
pad_h0, pad_h1, pad_w0, pad_w1, dilation_h, dilation_w, activation,
teg_weight, teg_bias, padMode, nstripes);
if(NULL == graph )
{
return NULL;
}
}
return graph ;
}
bool tengine_forward(teng_graph_t &graph)
{
/* run */
if(teng_run_graph(graph, 1) < 0)
{
CV_LOG_WARNING(NULL,"Tengine: run_graph failed.");
return false ;
}
return true;
}
bool tengine_release(teng_graph_t &graph)
{
teng_postrun_graph(graph);
teng_destroy_graph(graph);
return true;
}
}
}
#endif

@ -194,7 +194,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0;
float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0;
float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN;
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
inp, "detection_out", "", scoreDiff, iouDiff, detectionConfThresh);
expectNoFallbacksFromIE(net);
}
@ -237,7 +237,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
scoreDiff = 0.03;
iouDiff = 0.08;
}
processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
inp, "detection_out", "", scoreDiff, iouDiff);
expectNoFallbacksFromIE(net);
}

@ -290,8 +290,8 @@ TEST(Reproducibility_SSD, Accuracy)
typedef testing::TestWithParam<tuple<Backend, Target> > Reproducibility_MobileNet_SSD;
TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
{
const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
const string proto = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false);
const string model = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false);
Net net = readNetFromCaffe(proto, model);
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
@ -731,7 +731,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
double scoreDiff = 0.0, iouDiff = 0.0;
double scoreDiff = 0.001, iouDiff = 0.03;
#if defined(INF_ENGINE_RELEASE)
if (target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@ -779,7 +779,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
double scoreDiff = 0.0, iouDiff = 0.0;
double scoreDiff = 0.003, iouDiff = 0.07;
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
scoreDiff = 0.02;
iouDiff = 0.13;

@ -407,15 +407,16 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine(
////////////////////////////////////////////////////////////////////////////////
// Fully-connected
////////////////////////////////////////////////////////////////////////////////
typedef TestWithParam<tuple<int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
typedef TestWithParam<tuple<int, int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
TEST_P(FullyConnected, Accuracy)
{
int inChannels = get<0>(GetParam());
Size inSize = get<1>(GetParam());
int outChannels = get<2>(GetParam());
bool hasBias = get<3>(GetParam());
Backend backendId = get<0>(get<4>(GetParam()));
Target targetId = get<1>(get<4>(GetParam()));
int batch = get<0>(GetParam());
int inChannels = get<1>(GetParam());
Size inSize = get<2>(GetParam());
int outChannels = get<3>(GetParam());
bool hasBias = get<4>(GetParam());
Backend backendId = get<0>(get<5>(GetParam()));
Target targetId = get<1>(get<5>(GetParam()));
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (targetId == DNN_TARGET_OPENCL_FP16 ||
@ -424,6 +425,13 @@ TEST_P(FullyConnected, Accuracy)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
}
#endif
// https://github.com/openvinotoolkit/openvino/issues/19436
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16 && batch == 16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2023000000)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL && batch == 16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL);
#endif
Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F);
randu(weights, -1.0f, 1.0f);
@ -439,7 +447,7 @@ TEST_P(FullyConnected, Accuracy)
lp.type = "InnerProduct";
lp.name = "testLayer";
int sz[] = {1, inChannels, inSize.height, inSize.width};
int sz[] = {batch, inChannels, inSize.height, inSize.width};
Mat input(4, &sz[0], CV_32F);
double l1 = 0.0;
@ -453,11 +461,13 @@ TEST_P(FullyConnected, Accuracy)
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16)
{
l1 = 0.01;
if (INF_ENGINE_VER_MAJOR_GE(2023000000))
lInf = 0.016;
}
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL)
{
l1 = 5e-3;
lInf = 7e-3;
lInf = INF_ENGINE_VER_MAJOR_GE(2023000000) ? 0.016 : 7e-3;
}
#endif
if (targetId == DNN_TARGET_CUDA_FP16)
@ -467,6 +477,7 @@ TEST_P(FullyConnected, Accuracy)
}
INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine(
/*batch*/ Values(1, 2, 4, 8, 16),
/*in channels*/ Values(3, 4),
/*in size*/ Values(Size(5, 4), Size(4, 5), Size(1, 1)),
/*out channels*/ Values(3, 4),

@ -878,14 +878,14 @@ TEST_P(Test_Int8_nets, MobileNet_SSD)
if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false),
findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false));
Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false),
findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false));
Mat inp = imread(_tf("street.png"));
Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11;
float confThreshold = FLT_MIN, scoreDiff = 0.084, iouDiff = 0.43;
testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
}

@ -120,6 +120,28 @@ TEST(blobFromImageWithParams_4ch, letter_box)
EXPECT_EQ(0, cvtest::norm(targetBlob, blob, NORM_INF));
}
TEST(blobFromImagesWithParams_4ch, multi_image)
{
Mat img(10, 10, CV_8UC4, cv::Scalar(0, 1, 2, 3));
Scalar scalefactor(0.1, 0.2, 0.3, 0.4);
Image2BlobParams param;
param.scalefactor = scalefactor;
param.datalayout = DNN_LAYOUT_NHWC;
Mat blobs = blobFromImagesWithParams(std::vector<Mat> { img, 2*img }, param);
vector<Range> ranges;
ranges.push_back(Range(0, 1));
ranges.push_back(Range(0, blobs.size[1]));
ranges.push_back(Range(0, blobs.size[2]));
ranges.push_back(Range(0, blobs.size[3]));
Mat blob0 = blobs(ranges);
ranges[0] = Range(1, 2);
Mat blob1 = blobs(ranges);
EXPECT_EQ(0, cvtest::norm(2*blob0, blob1, NORM_INF));
}
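The new multi-image test above uses a per-channel scale factor and NHWC layout; for comparison, a minimal single-image sketch of the same parameter struct (values are illustrative only):
#include <opencv2/dnn.hpp>
// Build an NHWC blob from one 4-channel image with a per-channel scale factor.
static cv::Mat makeExampleBlob()
{
    cv::Mat img(10, 10, CV_8UC4, cv::Scalar(0, 1, 2, 3));
    cv::dnn::Image2BlobParams param;
    param.scalefactor = cv::Scalar(0.1, 0.2, 0.3, 0.4);  // applied per channel
    param.datalayout = cv::dnn::DNN_LAYOUT_NHWC;          // N x H x W x C output
    return cv::dnn::blobFromImageWithParams(img, param);  // source size kept by default
}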
TEST(readNet, Regression)
{
Net net = readNet(findDataFile("dnn/squeezenet_v1.1.prototxt"),

@ -490,8 +490,8 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
refBoxes.emplace_back(left, top, width, height);
}
std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
Scalar mean = Scalar(127.5, 127.5, 127.5);
double scale = 1.0 / 127.5;
@ -511,7 +511,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
}
else if (target == DNN_TARGET_CUDA_FP16)
{
scoreDiff = 0.0021;
scoreDiff = 0.0028;
iouDiff = 1e-2;
}
float confThreshold = FLT_MIN;
@ -595,8 +595,8 @@ TEST_P(Test_Model, Detection_normalized)
std::vector<float> refConfidences = {0.999222f};
std::vector<Rect2d> refBoxes = {Rect2d(0, 4, 227, 222)};
std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
Scalar mean = Scalar(127.5, 127.5, 127.5);
double scale = 1.0 / 127.5;

@ -128,6 +128,11 @@ TEST_P(Test_TFLite, max_unpooling)
if (backend == DNN_BACKEND_CUDA)
applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2022010000)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU) {
if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
@ -152,14 +157,7 @@ TEST_P(Test_TFLite, max_unpooling)
net.setInput(input);
std::vector<std::vector<Mat> > outs;
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
// TODO: seems like a bug with a retrieving intermediate tensors
net.forward(outs, {"conv2d_transpose_4", "p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
outs.erase(outs.begin());
}
else {
net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
}
net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
ASSERT_EQ(outs.size(), 4);
ASSERT_EQ(outs[0].size(), 1);

@ -0,0 +1,135 @@
/*
**
** License Agreement
** For chi_table.h
**
** Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
**
** Redistribution and use in source and binary forms, with or without modification,
** are permitted provided that the following conditions are met:
**
** * Redistribution's of source code must retain the above copyright notice,
** this list of conditions and the following disclaimer.
**
** * Redistribution's in binary form must reproduce the above copyright notice,
** this list of conditions and the following disclaimer in the documentation
** and/or other materials provided with the distribution.
**
** * The name of the copyright holders may not be used to endorse or promote products
** derived from this software without specific prior written permission.
**
** This software is provided by the copyright holders and contributors "as is" and
** any express or implied warranties, including, but not limited to, the implied
** warranties of merchantability and fitness for a particular purpose are disclaimed.
** In no event shall the Intel Corporation or contributors be liable for any direct,
** indirect, incidental, special, exemplary, or consequential damages
** (including, but not limited to, procurement of substitute goods or services;
** loss of use, data, or profits; or business interruption) however caused
** and on any theory of liability, whether in contract, strict liability,
** or tort (including negligence or otherwise) arising in any way out of
** the use of this software, even if advised of the possibility of such damage.
**
** Content origin: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
*/
#define TABLE_SIZE 400
static double chitab3[]={0, 0.0150057, 0.0239478, 0.0315227,
0.0383427, 0.0446605, 0.0506115, 0.0562786,
0.0617174, 0.0669672, 0.0720573, 0.0770099,
0.081843, 0.0865705, 0.0912043, 0.0957541,
0.100228, 0.104633, 0.108976, 0.113261,
0.117493, 0.121676, 0.125814, 0.12991,
0.133967, 0.137987, 0.141974, 0.145929,
0.149853, 0.15375, 0.15762, 0.161466,
0.165287, 0.169087, 0.172866, 0.176625,
0.180365, 0.184088, 0.187794, 0.191483,
0.195158, 0.198819, 0.202466, 0.2061,
0.209722, 0.213332, 0.216932, 0.220521,
0.2241, 0.22767, 0.231231, 0.234783,
0.238328, 0.241865, 0.245395, 0.248918,
0.252435, 0.255947, 0.259452, 0.262952,
0.266448, 0.269939, 0.273425, 0.276908,
0.280386, 0.283862, 0.287334, 0.290803,
0.29427, 0.297734, 0.301197, 0.304657,
0.308115, 0.311573, 0.315028, 0.318483,
0.321937, 0.32539, 0.328843, 0.332296,
0.335749, 0.339201, 0.342654, 0.346108,
0.349562, 0.353017, 0.356473, 0.35993,
0.363389, 0.366849, 0.37031, 0.373774,
0.377239, 0.380706, 0.384176, 0.387648,
0.391123, 0.3946, 0.39808, 0.401563,
0.405049, 0.408539, 0.412032, 0.415528,
0.419028, 0.422531, 0.426039, 0.429551,
0.433066, 0.436586, 0.440111, 0.44364,
0.447173, 0.450712, 0.454255, 0.457803,
0.461356, 0.464915, 0.468479, 0.472049,
0.475624, 0.479205, 0.482792, 0.486384,
0.489983, 0.493588, 0.4972, 0.500818,
0.504442, 0.508073, 0.511711, 0.515356,
0.519008, 0.522667, 0.526334, 0.530008,
0.533689, 0.537378, 0.541075, 0.54478,
0.548492, 0.552213, 0.555942, 0.55968,
0.563425, 0.56718, 0.570943, 0.574715,
0.578497, 0.582287, 0.586086, 0.589895,
0.593713, 0.597541, 0.601379, 0.605227,
0.609084, 0.612952, 0.61683, 0.620718,
0.624617, 0.628526, 0.632447, 0.636378,
0.64032, 0.644274, 0.648239, 0.652215,
0.656203, 0.660203, 0.664215, 0.668238,
0.672274, 0.676323, 0.680384, 0.684457,
0.688543, 0.692643, 0.696755, 0.700881,
0.70502, 0.709172, 0.713339, 0.717519,
0.721714, 0.725922, 0.730145, 0.734383,
0.738636, 0.742903, 0.747185, 0.751483,
0.755796, 0.760125, 0.76447, 0.768831,
0.773208, 0.777601, 0.782011, 0.786438,
0.790882, 0.795343, 0.799821, 0.804318,
0.808831, 0.813363, 0.817913, 0.822482,
0.827069, 0.831676, 0.836301, 0.840946,
0.84561, 0.850295, 0.854999, 0.859724,
0.864469, 0.869235, 0.874022, 0.878831,
0.883661, 0.888513, 0.893387, 0.898284,
0.903204, 0.908146, 0.913112, 0.918101,
0.923114, 0.928152, 0.933214, 0.938301,
0.943413, 0.94855, 0.953713, 0.958903,
0.964119, 0.969361, 0.974631, 0.979929,
0.985254, 0.990608, 0.99599, 1.0014,
1.00684, 1.01231, 1.01781, 1.02335,
1.02891, 1.0345, 1.04013, 1.04579,
1.05148, 1.05721, 1.06296, 1.06876,
1.07459, 1.08045, 1.08635, 1.09228,
1.09826, 1.10427, 1.11032, 1.1164,
1.12253, 1.1287, 1.1349, 1.14115,
1.14744, 1.15377, 1.16015, 1.16656,
1.17303, 1.17954, 1.18609, 1.19269,
1.19934, 1.20603, 1.21278, 1.21958,
1.22642, 1.23332, 1.24027, 1.24727,
1.25433, 1.26144, 1.26861, 1.27584,
1.28312, 1.29047, 1.29787, 1.30534,
1.31287, 1.32046, 1.32812, 1.33585,
1.34364, 1.3515, 1.35943, 1.36744,
1.37551, 1.38367, 1.39189, 1.4002,
1.40859, 1.41705, 1.42561, 1.43424,
1.44296, 1.45177, 1.46068, 1.46967,
1.47876, 1.48795, 1.49723, 1.50662,
1.51611, 1.52571, 1.53541, 1.54523,
1.55517, 1.56522, 1.57539, 1.58568,
1.59611, 1.60666, 1.61735, 1.62817,
1.63914, 1.65025, 1.66152, 1.67293,
1.68451, 1.69625, 1.70815, 1.72023,
1.73249, 1.74494, 1.75757, 1.77041,
1.78344, 1.79669, 1.81016, 1.82385,
1.83777, 1.85194, 1.86635, 1.88103,
1.89598, 1.91121, 1.92674, 1.94257,
1.95871, 1.97519, 1.99201, 2.0092,
2.02676, 2.04471, 2.06309, 2.08189,
2.10115, 2.12089, 2.14114, 2.16192,
2.18326, 2.2052, 2.22777, 2.25101,
2.27496, 2.29966, 2.32518, 2.35156,
2.37886, 2.40717, 2.43655, 2.46709,
2.49889, 2.53206, 2.56673, 2.60305,
2.64117, 2.6813, 2.72367, 2.76854,
2.81623, 2.86714, 2.92173, 2.98059,
3.04446, 3.1143, 3.19135, 3.27731,
3.37455, 3.48653, 3.61862, 3.77982,
3.98692, 4.2776, 4.77167, 133.333 };

@ -0,0 +1,28 @@
License Agreement
For chi_table.h
Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistribution's of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistribution's in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* The name of the copyright holders may not be used to endorse or promote products
derived from this software without specific prior written permission.
This software is provided by the copyright holders and contributors "as is" and
any express or implied warranties, including, but not limited to, the implied
warranties of merchantability and fitness for a particular purpose are disclaimed.
In no event shall the Intel Corporation or contributors be liable for any direct,
indirect, incidental, special, exemplary, or consequential damages
(including, but not limited to, procurement of substitute goods or services;
loss of use, data, or profits; or business interruption) however caused
and on any theory of liability, whether in contract, strict liability,
or tort (including negligence or otherwise) arising in any way out of
the use of this software, even if advised of the possibility of such damage.

@ -7,3 +7,5 @@ if(DEBUG_opencv_features2d)
list(APPEND debug_modules opencv_highgui)
endif()
ocv_define_module(features2d opencv_imgproc ${debug_modules} OPTIONAL opencv_flann WRAP java objc python js)
ocv_install_3rdparty_licenses(mscr "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mscr/chi_table_LICENSE.txt")

@ -30,18 +30,23 @@
* OpenCV functions for MSER extraction
*
* 1. there are two different implementations of MSER, one for gray images, one for color images
* 2. the gray image algorithm is taken from: Linear Time Maximally Stable Extremal Regions;
* 2. the gray image algorithm is taken from:
* Linear Time Maximally Stable Extremal Regions;
* the paper claims to be faster than union-find method;
* it actually gets 1.5~2m/s on my Centrino L7200 1.2GHz laptop.
* 3. the color image algorithm is taken from: Maximally Stable Colour Regions for Recognition and Match;
* 3. the color image algorithm is taken from:
* Maximally Stable Colour Regions for Recognition and Match;
* it should be much slower than gray image method ( 3~4 times );
* the chi_table.h file is taken directly from paper's source code which is distributed under permissive BSD-like license: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
* the chi_table.h file is taken directly from the paper's source code:
* http://users.isy.liu.se/cvl/perfo/software/chi_table.h
* license (BSD-like) is located in the file: 3rdparty/mscr/chi_table_LICENSE.txt
* 4. though the name is *contours*, the result actually is a list of point set.
*/
#include "precomp.hpp"
#include "opencv2/imgproc/imgproc_c.h"
#include <limits>
#include "../3rdparty/mscr/chi_table.h"
namespace cv
{
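As a point of reference for item 4 in the comment above, here is a minimal usage sketch (not part of this diff, with a hypothetical input file name) of the public cv::MSER interface, which hands back each detected region as a point set rather than a contour:

#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    // hypothetical input file; any 8-bit grayscale image works here
    cv::Mat img = cv::imread("input.png", cv::IMREAD_GRAYSCALE);
    if (img.empty())
        return 1;

    cv::Ptr<cv::MSER> mser = cv::MSER::create();     // default parameters
    std::vector<std::vector<cv::Point>> regions;     // one point set per detected region
    std::vector<cv::Rect> bboxes;                    // bounding box of each region
    mser->detectRegions(img, regions, bboxes);
    return 0;
}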
@@ -613,113 +618,6 @@ the color MSER has not been completely refactored yet. We leave it mostly as-is,
with just enough changes to convert C structures to C++ ones and
add support for color images into MSER_Impl::detectAndLabel.
*/
const int TABLE_SIZE = 400;
static const float chitab3[]=
{
0.f, 0.0150057f, 0.0239478f, 0.0315227f,
0.0383427f, 0.0446605f, 0.0506115f, 0.0562786f,
0.0617174f, 0.0669672f, 0.0720573f, 0.0770099f,
0.081843f, 0.0865705f, 0.0912043f, 0.0957541f,
0.100228f, 0.104633f, 0.108976f, 0.113261f,
0.117493f, 0.121676f, 0.125814f, 0.12991f,
0.133967f, 0.137987f, 0.141974f, 0.145929f,
0.149853f, 0.15375f, 0.15762f, 0.161466f,
0.165287f, 0.169087f, 0.172866f, 0.176625f,
0.180365f, 0.184088f, 0.187794f, 0.191483f,
0.195158f, 0.198819f, 0.202466f, 0.2061f,
0.209722f, 0.213332f, 0.216932f, 0.220521f,
0.2241f, 0.22767f, 0.231231f, 0.234783f,
0.238328f, 0.241865f, 0.245395f, 0.248918f,
0.252435f, 0.255947f, 0.259452f, 0.262952f,
0.266448f, 0.269939f, 0.273425f, 0.276908f,
0.280386f, 0.283862f, 0.287334f, 0.290803f,
0.29427f, 0.297734f, 0.301197f, 0.304657f,
0.308115f, 0.311573f, 0.315028f, 0.318483f,
0.321937f, 0.32539f, 0.328843f, 0.332296f,
0.335749f, 0.339201f, 0.342654f, 0.346108f,
0.349562f, 0.353017f, 0.356473f, 0.35993f,
0.363389f, 0.366849f, 0.37031f, 0.373774f,
0.377239f, 0.380706f, 0.384176f, 0.387648f,
0.391123f, 0.3946f, 0.39808f, 0.401563f,
0.405049f, 0.408539f, 0.412032f, 0.415528f,
0.419028f, 0.422531f, 0.426039f, 0.429551f,
0.433066f, 0.436586f, 0.440111f, 0.44364f,
0.447173f, 0.450712f, 0.454255f, 0.457803f,
0.461356f, 0.464915f, 0.468479f, 0.472049f,
0.475624f, 0.479205f, 0.482792f, 0.486384f,
0.489983f, 0.493588f, 0.4972f, 0.500818f,
0.504442f, 0.508073f, 0.511711f, 0.515356f,
0.519008f, 0.522667f, 0.526334f, 0.530008f,
0.533689f, 0.537378f, 0.541075f, 0.54478f,
0.548492f, 0.552213f, 0.555942f, 0.55968f,
0.563425f, 0.56718f, 0.570943f, 0.574715f,
0.578497f, 0.582287f, 0.586086f, 0.589895f,
0.593713f, 0.597541f, 0.601379f, 0.605227f,
0.609084f, 0.612952f, 0.61683f, 0.620718f,
0.624617f, 0.628526f, 0.632447f, 0.636378f,
0.64032f, 0.644274f, 0.648239f, 0.652215f,
0.656203f, 0.660203f, 0.664215f, 0.668238f,
0.672274f, 0.676323f, 0.680384f, 0.684457f,
0.688543f, 0.692643f, 0.696755f, 0.700881f,
0.70502f, 0.709172f, 0.713339f, 0.717519f,
0.721714f, 0.725922f, 0.730145f, 0.734383f,
0.738636f, 0.742903f, 0.747185f, 0.751483f,
0.755796f, 0.760125f, 0.76447f, 0.768831f,
0.773208f, 0.777601f, 0.782011f, 0.786438f,
0.790882f, 0.795343f, 0.799821f, 0.804318f,
0.808831f, 0.813363f, 0.817913f, 0.822482f,
0.827069f, 0.831676f, 0.836301f, 0.840946f,
0.84561f, 0.850295f, 0.854999f, 0.859724f,
0.864469f, 0.869235f, 0.874022f, 0.878831f,
0.883661f, 0.888513f, 0.893387f, 0.898284f,
0.903204f, 0.908146f, 0.913112f, 0.918101f,
0.923114f, 0.928152f, 0.933214f, 0.938301f,
0.943413f, 0.94855f, 0.953713f, 0.958903f,
0.964119f, 0.969361f, 0.974631f, 0.979929f,
0.985254f, 0.990608f, 0.99599f, 1.0014f,
1.00684f, 1.01231f, 1.01781f, 1.02335f,
1.02891f, 1.0345f, 1.04013f, 1.04579f,
1.05148f, 1.05721f, 1.06296f, 1.06876f,
1.07459f, 1.08045f, 1.08635f, 1.09228f,
1.09826f, 1.10427f, 1.11032f, 1.1164f,
1.12253f, 1.1287f, 1.1349f, 1.14115f,
1.14744f, 1.15377f, 1.16015f, 1.16656f,
1.17303f, 1.17954f, 1.18609f, 1.19269f,
1.19934f, 1.20603f, 1.21278f, 1.21958f,
1.22642f, 1.23332f, 1.24027f, 1.24727f,
1.25433f, 1.26144f, 1.26861f, 1.27584f,
1.28312f, 1.29047f, 1.29787f, 1.30534f,
1.31287f, 1.32046f, 1.32812f, 1.33585f,
1.34364f, 1.3515f, 1.35943f, 1.36744f,
1.37551f, 1.38367f, 1.39189f, 1.4002f,
1.40859f, 1.41705f, 1.42561f, 1.43424f,
1.44296f, 1.45177f, 1.46068f, 1.46967f,
1.47876f, 1.48795f, 1.49723f, 1.50662f,
1.51611f, 1.52571f, 1.53541f, 1.54523f,
1.55517f, 1.56522f, 1.57539f, 1.58568f,
1.59611f, 1.60666f, 1.61735f, 1.62817f,
1.63914f, 1.65025f, 1.66152f, 1.67293f,
1.68451f, 1.69625f, 1.70815f, 1.72023f,
1.73249f, 1.74494f, 1.75757f, 1.77041f,
1.78344f, 1.79669f, 1.81016f, 1.82385f,
1.83777f, 1.85194f, 1.86635f, 1.88103f,
1.89598f, 1.91121f, 1.92674f, 1.94257f,
1.95871f, 1.97519f, 1.99201f, 2.0092f,
2.02676f, 2.04471f, 2.06309f, 2.08189f,
2.10115f, 2.12089f, 2.14114f, 2.16192f,
2.18326f, 2.2052f, 2.22777f, 2.25101f,
2.27496f, 2.29966f, 2.32518f, 2.35156f,
2.37886f, 2.40717f, 2.43655f, 2.46709f,
2.49889f, 2.53206f, 2.56673f, 2.60305f,
2.64117f, 2.6813f, 2.72367f, 2.76854f,
2.81623f, 2.86714f, 2.92173f, 2.98059f,
3.04446f, 3.1143f, 3.19135f, 3.27731f,
3.37455f, 3.48653f, 3.61862f, 3.77982f,
3.98692f, 4.2776f, 4.77167f, 133.333f
};
struct MSCRNode;
struct TempMSCR
