Merge branch 4.x

pull/23149/head
Alexander Alekhin 2 years ago
commit a42d879925
  1. 24
      cmake/OpenCVDetectCUDA.cmake
  2. 3
      cmake/OpenCVFindFrameworks.cmake
  3. 3
      cmake/checks/cpu_rvv.cpp
  4. 4
      modules/core/CMakeLists.txt
  5. 75
      modules/core/include/opencv2/core/hal/intrin_rvv.hpp
  6. 40
      modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
  7. 2
      modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp
  8. 12
      modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
  9. 2
      modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
  10. 4
      modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp
  11. 5
      modules/dnn/test/test_onnx_importer.cpp
  12. 2
      modules/ts/include/opencv2/ts/cuda_test.hpp
  13. 4
      modules/video/CMakeLists.txt
  14. 2
      platforms/js/opencv_js.config.py
  15. 2
      samples/python/stitching_detailed.py

@ -101,18 +101,20 @@ if(CUDA_FOUND)
message(STATUS "CUDA detected: " ${CUDA_VERSION}) message(STATUS "CUDA detected: " ${CUDA_VERSION})
OCV_OPTION(CUDA_ENABLE_DEPRECATED_GENERATION "Enable deprecated generations in the list" OFF) OCV_OPTION(CUDA_ENABLE_DEPRECATED_GENERATION "Enable deprecated generations in the list" OFF)
set(_generations "Maxwell" "Pascal" "Volta" "Turing" "Ampere") set(_generations "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "Lovelace" "Hopper")
if(CUDA_ENABLE_DEPRECATED_GENERATION) if(CUDA_ENABLE_DEPRECATED_GENERATION)
set(_generations "Fermi" "${_generations}") set(_generations "Fermi" "${_generations}")
set(_generations "Kepler" "${_generations}") set(_generations "Kepler" "${_generations}")
endif() endif()
set(_arch_fermi "2.0") set(_arch_fermi "2.0")
set(_arch_kepler "3.0;3.5;3.7") set(_arch_kepler "3.0;3.5;3.7")
set(_arch_maxwell "5.0;5.2") set(_arch_maxwell "5.0;5.2")
set(_arch_pascal "6.0;6.1") set(_arch_pascal "6.0;6.1")
set(_arch_volta "7.0") set(_arch_volta "7.0")
set(_arch_turing "7.5") set(_arch_turing "7.5")
set(_arch_ampere "8.0;8.6") set(_arch_ampere "8.0;8.6")
set(_arch_lovelace "8.9")
set(_arch_hopper "9.0")
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
list(APPEND _generations "Auto") list(APPEND _generations "Auto")
endif() endif()
@ -241,6 +243,10 @@ if(CUDA_FOUND)
set(__cuda_arch_bin ${_arch_turing}) set(__cuda_arch_bin ${_arch_turing})
elseif(CUDA_GENERATION STREQUAL "Ampere") elseif(CUDA_GENERATION STREQUAL "Ampere")
set(__cuda_arch_bin ${_arch_ampere}) set(__cuda_arch_bin ${_arch_ampere})
elseif(CUDA_GENERATION STREQUAL "Lovelace")
set(__cuda_arch_bin ${_arch_lovelace})
elseif(CUDA_GENERATION STREQUAL "Hopper")
set(__cuda_arch_bin ${_arch_hopper})
elseif(CUDA_GENERATION STREQUAL "Auto") elseif(CUDA_GENERATION STREQUAL "Auto")
ocv_detect_native_cuda_arch(_nvcc_res _nvcc_out) ocv_detect_native_cuda_arch(_nvcc_res _nvcc_out)
if(NOT _nvcc_res EQUAL 0) if(NOT _nvcc_res EQUAL 0)
@ -286,6 +292,8 @@ if(CUDA_FOUND)
${_arch_volta} ${_arch_volta}
${_arch_turing} ${_arch_turing}
${_arch_ampere} ${_arch_ampere}
${_arch_lovelace}
${_arch_hopper}
) )
endif() endif()
endif() endif()

@ -32,6 +32,9 @@ if(WITH_OPENMP)
if(OPENMP_FOUND) if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
# Add the OpenMP header locations to the include path, but only when the
# variable holds a usable (defined, non-empty, non-false) value.
if(OpenMP_CXX_INCLUDE_DIRS)
  ocv_include_directories(${OpenMP_CXX_INCLUDE_DIRS})
endif()
endif() endif()
set(HAVE_OPENMP "${OPENMP_FOUND}") set(HAVE_OPENMP "${OPENMP_FOUND}")
endif() endif()

@ -9,6 +9,9 @@
int test() int test()
{ {
const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f }; const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f };
uint64_t ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
vuint8m1_t a = vreinterpret_v_u64m1_u8m1(vle64_v_u64m1(ptr, 2));
//vuint8m1_t a = (vuint8m1_t)vle64_v_u64m1(ptr, 2);
vfloat32m1_t val = vle32_v_f32m1((const float*)(src), 4); vfloat32m1_t val = vle32_v_f32m1((const float*)(src), 4);
return (int)vfmv_f_s_f32m1_f32(val); return (int)vfmv_f_s_f32m1_f32(val);
} }

@ -168,6 +168,10 @@ if(HAVE_HPX)
ocv_target_link_libraries(${the_module} LINK_PRIVATE "${HPX_LIBRARIES}") ocv_target_link_libraries(${the_module} LINK_PRIVATE "${HPX_LIBRARIES}")
endif() endif()
# Link the OpenMP libraries privately into this module when OpenMP is enabled
# and OpenMP_CXX_LIBRARIES holds a usable (defined, non-empty) value.
if(HAVE_OPENMP AND OpenMP_CXX_LIBRARIES)
  ocv_target_link_libraries(${the_module} LINK_PRIVATE "${OpenMP_CXX_LIBRARIES}")
endif()
ocv_add_accuracy_tests() ocv_add_accuracy_tests()
ocv_add_perf_tests() ocv_add_perf_tests()

@ -1920,20 +1920,29 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \ #define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
inline bool v_check_all(const _Tpvec& a) \ inline bool v_check_all(const _Tpvec& a) \
{ \ { \
v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl)); \ v_uint64x2 v = v_uint64x2(vreinterpret_v_##suffix##m1_u64m1(vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl))); \
return (v.val[0] | v.val[1]) == 0; \ return (v.val[0] | v.val[1]) == 0; \
} \ } \
inline bool v_check_any(const _Tpvec& a) \ inline bool v_check_any(const _Tpvec& a) \
{ \ { \
v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift, vl)); \ v_uint64x2 v = v_uint64x2(vreinterpret_v_##suffix##m1_u64m1(vsrl_vx_##suffix##m1(a, shift, vl))); \
return (v.val[0] | v.val[1]) != 0; \ return (v.val[0] | v.val[1]) != 0; \
} }
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16) OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8) OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4) OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2) //OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2)
inline bool v_check_all(const v_uint64x2& a)
{
v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(vnot_v_u64m1(a, 2), 63, 2));
return (v.val[0] | v.val[1]) == 0;
}
inline bool v_check_any(const v_uint64x2& a)
{
v_uint64x2 v = v_uint64x2(vsrl_vx_u64m1(a, 63, 2));
return (v.val[0] | v.val[1]) != 0;
}
inline bool v_check_all(const v_int8x16& a) inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); } { return v_check_all(v_reinterpret_as_u8(a)); }
@ -2035,15 +2044,15 @@ OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
// use reinterpret instead of c-style casting. // use reinterpret instead of c-style casting.
#ifndef __clang__ #ifndef __clang__
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, vl) \ #define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width, vl) \
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \ { \
return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b), vl), 0, vl)); \ return _rTpvec(rshr(vreinterpret_v_i##width##m2_u##width##m2(sub(v_max(a, b), v_min(a, b), vl)), 0, vl)); \
} }
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16) OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 8) OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 32, 8)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 4) OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 64, 4)
#else #else
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width, vl) \ #define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width, vl) \
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
@ -2806,12 +2815,15 @@ OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
//////////// Pack triplets //////////// //////////// Pack triplets ////////////
// use reinterpret instead of c-style casting.
#ifndef __clang__
inline v_int8x16 v_pack_triplets(const v_int8x16& vec) inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{ {
uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A}; const uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A};
return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)vint8m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16)); const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
return v_reinterpret_as_s8(v_uint8x16(
vrgather_vv_u8m1(
v_reinterpret_as_u8(vec),
v_reinterpret_as_u8(flags),
16)));
} }
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{ {
@ -2820,8 +2832,13 @@ inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
inline v_int16x8 v_pack_triplets(const v_int16x8& vec) inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{ {
uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A}; const uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A};
return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vint16m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16)); const v_uint64x2 flags(vle64_v_u64m1(ptr, 2));
return v_reinterpret_as_s16(v_uint8x16(
vrgather_vv_u8m1(
v_reinterpret_as_u8(vec),
v_reinterpret_as_u8(flags),
16)));
} }
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{ {
@ -2832,34 +2849,6 @@ inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
#else
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
uint64 ptr[2] = {0x0908060504020100, 0xFFFFFF0F0E0D0C0A};
return v_int8x16(vreinterpret_i8m1(vrgather_vv_u8m1(v_reinterpret_as_u8(vec), vreinterpret_u8m1(vle64_v_u64m1(ptr, 2)), 16)));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{
return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
}
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
uint64 ptr[2] = {0x0908050403020100, 0xFFFF0F0E0D0C0B0A};
return v_int16x8(v_reinterpret_as_s16(v_uint8x16(vrgather_vv_u8m1(v_reinterpret_as_u8(vec), vreinterpret_u8m1(vle64_v_u64m1(ptr, 2)), 16))));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{
return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
}
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
#endif
////// FP16 support /////// ////// FP16 support ///////
#if CV_FP16 #if CV_FP16

@ -24,7 +24,7 @@ static void depthWiseBlockConv2D(const float* wptr,
const float* inptr_, const float* inptr_,
int height, int width, int height, int width,
float* outptr_, float* outptr_,
int out_d, int outH, int outW) int out_d, int outH, int outW, bool fusedAdd)
{ {
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
@ -57,6 +57,8 @@ static void depthWiseBlockConv2D(const float* wptr,
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (fusedAdd)
out += outptr[0];
if (relu) if (relu)
out = out > 0.f ? out : out*relu_coeff; out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out; outptr[0] = out;
@ -65,6 +67,10 @@ static void depthWiseBlockConv2D(const float* wptr,
#if CV_SIMD128 #if CV_SIMD128
const int VEC_NLANES = 4; const int VEC_NLANES = 4;
if (fusedAdd)
outW1 = max(out_j, outW1 - outW1%VEC_NLANES);
v_float32x4 vw00 = v_setall_f32(w00); v_float32x4 vw00 = v_setall_f32(w00);
v_float32x4 vw01 = v_setall_f32(w01); v_float32x4 vw01 = v_setall_f32(w01);
v_float32x4 vw02 = v_setall_f32(w02); v_float32x4 vw02 = v_setall_f32(w02);
@ -104,6 +110,8 @@ static void depthWiseBlockConv2D(const float* wptr,
v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 +
v10*vw10 + v11*vw11 + v12*vw12 + v10*vw10 + v11*vw11 + v12*vw12 +
v20*vw20 + v21*vw21 + v22*vw22 + vbias; v20*vw20 + v21*vw21 + v22*vw22 + vbias;
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
if (relu) if (relu)
vout = v_select(vout > z, vout, vout*vrc); vout = v_select(vout > z, vout, vout*vrc);
v_store(outptr + out_j, vout); v_store(outptr + out_j, vout);
@ -134,6 +142,8 @@ static void depthWiseBlockConv2D(const float* wptr,
v10 * vw10 + v11 * vw11 + v12 * vw12 + v10 * vw10 + v11 * vw11 + v12 * vw12 +
v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias; v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias;
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
if (relu) if (relu)
vout = v_select(vout > z, vout, vout*vrc); vout = v_select(vout > z, vout, vout*vrc);
v_store(outptr + out_j, vout); v_store(outptr + out_j, vout);
@ -148,6 +158,8 @@ static void depthWiseBlockConv2D(const float* wptr,
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (fusedAdd)
out += outptr[out_j];
if (relu) if (relu)
out = out > 0.f ? out : out*relu_coeff; out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out; outptr[out_j] = out;
@ -175,6 +187,8 @@ static void depthWiseBlockConv2D(const float* wptr,
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (fusedAdd)
out += outptr[out_j];
if (relu) if (relu)
out = out > 0.f ? out : out*relu_coeff; out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out; outptr[out_j] = out;
@ -187,7 +201,7 @@ static void depthWiseBlockConv1D(const float* wptr,
const float* biasptr, const float* relu, const float* biasptr, const float* relu,
const float* inptr_, int width, const float* inptr_, int width,
float* outptr_, float* outptr_,
int out_d, int outW) int out_d, int outW, bool fusedAdd)
{ {
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2]; const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2];
int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l)/stride_w); int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l)/stride_w);
@ -201,7 +215,8 @@ static void depthWiseBlockConv1D(const float* wptr,
if (pad_l > 0) if (pad_l > 0)
{ {
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + bias; out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + bias;
if (fusedAdd)
out += outptr[0];
if (relu) if (relu)
out = out > 0.f ? out : out*relu_coeff; out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out; outptr[0] = out;
@ -210,6 +225,8 @@ static void depthWiseBlockConv1D(const float* wptr,
#if CV_SIMD128 #if CV_SIMD128
const int VEC_NLANES = 4; const int VEC_NLANES = 4;
if (fusedAdd)
outW1 = max(out_j, outW1 - outW1%VEC_NLANES);
v_float32x4 vw00 = v_setall_f32(w00); v_float32x4 vw00 = v_setall_f32(w00);
v_float32x4 vw01 = v_setall_f32(w01); v_float32x4 vw01 = v_setall_f32(w01);
v_float32x4 vw02 = v_setall_f32(w02); v_float32x4 vw02 = v_setall_f32(w02);
@ -235,6 +252,8 @@ static void depthWiseBlockConv1D(const float* wptr,
v02 = v_load(imgptr0 + in_j + dilation_w*2); v02 = v_load(imgptr0 + in_j + dilation_w*2);
v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias; v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias;
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
if (relu) if (relu)
vout = v_select(vout > z, vout, vout*vrc); vout = v_select(vout > z, vout, vout*vrc);
v_store(outptr + out_j, vout); v_store(outptr + out_j, vout);
@ -258,6 +277,9 @@ static void depthWiseBlockConv1D(const float* wptr,
v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias; v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias;
if (fusedAdd)
vout = v_load(outptr + out_j) + vout;
if (relu) if (relu)
vout = v_select(vout > z, vout, vout*vrc); vout = v_select(vout > z, vout, vout*vrc);
v_store(outptr + out_j, vout); v_store(outptr + out_j, vout);
@ -270,6 +292,8 @@ static void depthWiseBlockConv1D(const float* wptr,
{ {
int in_j = out_j * stride_w - pad_l; int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + bias; out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + bias;
if (fusedAdd)
out += outptr[out_j];
if (relu) if (relu)
out = out > 0.f ? out : out*relu_coeff; out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out; outptr[out_j] = out;
@ -295,6 +319,8 @@ static void depthWiseBlockConv1D(const float* wptr,
s2 = 0.f; s2 = 0.f;
} }
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + bias; out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + bias;
if (fusedAdd)
out += outptr[out_j];
if (relu) if (relu)
out = out > 0.f ? out : out*relu_coeff; out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out; outptr[out_j] = out;
@ -302,7 +328,7 @@ static void depthWiseBlockConv1D(const float* wptr,
} }
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ_, void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ_,
const std::vector<float>& reluslope) const std::vector<float>& reluslope, bool fusedAdd)
{ {
Mat input = _input.getMat(); Mat input = _input.getMat();
Mat output = _output.getMat(); Mat output = _output.getMat();
@ -349,7 +375,7 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& c
#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV #if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV
// TODO: remove the following limitation, need change code in layers_common.simd.hpp. // TODO: remove the following limitation, need change code in layers_common.simd.hpp.
bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1); bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd;
#endif #endif
std::vector<int> ofstab_(3 * ksize, 0); std::vector<int> ofstab_(3 * ksize, 0);
int *ofstab = ofstab_.data(); int *ofstab = ofstab_.data();
@ -399,11 +425,11 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& c
else else
#endif #endif
depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd);
} }
else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D. else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D.
{ {
depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0); depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd);
} }
if (activ) if (activ)

@ -119,7 +119,7 @@ void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, b
void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, void _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
float* outbuf, int Cg, int iblock) float* outbuf, int Cg, int iblock)
{ {
CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4);// && _FX_WINO_ATOM_F32 == 8); CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 8);
if (iblock > 3) if (iblock > 3)
{ {
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,

@ -105,6 +105,12 @@ Ptr<FastConv> initFastConv(
conv->conv_type = _FX_CONV_TYPE_GENERIC; conv->conv_type = _FX_CONV_TYPE_GENERIC;
#endif #endif
#if CV_TRY_AVX2
// Disable Winograd when CV_TRY_AVX2 is true, but conv->useAVX2 is false.
if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && !conv->useAVX2)
conv->conv_type = _FX_CONV_TYPE_GENERIC;
#endif
Mat weightsMat = _weightsMat.getMat(); Mat weightsMat = _weightsMat.getMat();
auto wShape = shape(weightsMat); auto wShape = shape(weightsMat);
const size_t wstep = weightsMat.step1(); const size_t wstep = weightsMat.step1();
@ -257,7 +263,7 @@ Ptr<FastConv> initFastConv(
// we can always read MR elements starting from any valid index // we can always read MR elements starting from any valid index
{ {
int k = 0, nbias = K + VEC_ALIGN; int k = 0, nbias = K + VEC_ALIGN;
conv->biasBuf.reserve(nbias); conv->biasBuf.resize(nbias);
float* biasBufPtr = conv->biasBuf.data(); float* biasBufPtr = conv->biasBuf.data();
for(; k < K; k++) for(; k < K; k++)
biasBufPtr[k] = srcBias ? srcBias[k] : 0.f; biasBufPtr[k] = srcBias ? srcBias[k] : 0.f;
@ -369,8 +375,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE) if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE)
{ {
// Depthwise-Convolution layer should not be followed by Add layer. // Depthwise-Convolution layer should not be followed by Add layer.
CV_Assert(fusedAddMat.empty() && (conv_dim == CONV_1D || conv_dim == CONV_2D)); CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D));
return runDepthwise(input, output, conv,actLayer.get(), reluslope); return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd);
} }
MatShape inputShape = shape(input); MatShape inputShape = shape(input);

@ -100,7 +100,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
const Ptr<ActivationLayer>& actLayer, const std::vector<float>& reluslope, bool fusedAdd); const Ptr<ActivationLayer>& actLayer, const std::vector<float>& reluslope, bool fusedAdd);
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ, void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ,
const std::vector<float>& reluslope); const std::vector<float>& reluslope, bool fusedAdd);
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv, int ntasks, int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv, int ntasks,
float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct); float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);

@ -22,7 +22,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
float* outbuf, int Cg, int iblock) float* outbuf, int Cg, int iblock)
{ {
#if CV_NEON && CV_NEON_AARCH64 #if CV_NEON && CV_NEON_AARCH64
CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4); CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4);
if (iblock > 3) if (iblock > 3)
{ {
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
@ -144,7 +144,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
} }
} }
#elif CV_SIMD128 #elif CV_SIMD128
CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4); CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4);
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
outbuf += _FX_WINO_ATOM_F32) outbuf += _FX_WINO_ATOM_F32)
{ {

@ -1726,6 +1726,11 @@ TEST_P(Test_ONNX_layers, ConvResizePool1d)
testONNXModels("conv_resize_pool_1d"); testONNXModels("conv_resize_pool_1d");
} }
// Runs the shared ONNX model check on the "depthwiseconv_add" test model.
// NOTE(review): presumably covers a depthwise convolution followed by an Add
// (the fused-add path) - confirm against the model file.
TEST_P(Test_ONNX_layers, DepthWiseAdd)
{
testONNXModels("depthwiseconv_add");
}
TEST_P(Test_ONNX_layers, SubFromConst) TEST_P(Test_ONNX_layers, SubFromConst)
{ {
testONNXModels("sub_from_const1"); testONNXModels("sub_from_const1");

@ -212,6 +212,8 @@ namespace cvtest
#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113)) #define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))
#define DIFFERENT_SIZES_EXTRA testing::Values(cv::Size(13, 1), cv::Size(1, 13), cv::Size(128, 128), cv::Size(113, 113))
// Depth // Depth
using perf::MatDepth; using perf::MatDepth;

@ -10,3 +10,7 @@ ocv_define_module(video
python python
js js
) )
# Link the OpenMP libraries privately into this module when OpenMP is enabled
# and OpenMP_CXX_LIBRARIES holds a usable (defined, non-empty) value.
if(HAVE_OPENMP AND OpenMP_CXX_LIBRARIES)
  ocv_target_link_libraries(${the_module} LINK_PRIVATE "${OpenMP_CXX_LIBRARIES}")
endif()

@ -112,7 +112,7 @@ objdetect = {'': ['groupRectangles'],
'HOGDescriptor': ['load', 'HOGDescriptor', 'getDefaultPeopleDetector', 'getDaimlerPeopleDetector', 'setSVMDetector', 'detectMultiScale'], 'HOGDescriptor': ['load', 'HOGDescriptor', 'getDefaultPeopleDetector', 'getDaimlerPeopleDetector', 'setSVMDetector', 'detectMultiScale'],
'CascadeClassifier': ['load', 'detectMultiScale2', 'CascadeClassifier', 'detectMultiScale3', 'empty', 'detectMultiScale'], 'CascadeClassifier': ['load', 'detectMultiScale2', 'CascadeClassifier', 'detectMultiScale3', 'empty', 'detectMultiScale'],
'QRCodeDetector': ['QRCodeDetector', 'decode', 'decodeCurved', 'detect', 'detectAndDecode', 'detectMulti', 'setEpsX', 'setEpsY'], 'QRCodeDetector': ['QRCodeDetector', 'decode', 'decodeCurved', 'detect', 'detectAndDecode', 'detectMulti', 'setEpsX', 'setEpsY'],
'ArucoDetector': ['getPredefinedDictionary', 'detectMarkers', 'refineDetectedMarkers', 'getDictionary', 'stetDictionary', 'getDetectorParameters', 'setDetectorParameters', 'getRefineParameters', 'setRefineParameters'], 'ArucoDetector': ['getPredefinedDictionary', 'detectMarkers', 'refineDetectedMarkers', 'getDictionary', 'setDictionary', 'getDetectorParameters', 'setDetectorParameters', 'getRefineParameters', 'setRefineParameters'],
'GridBoard': ['create','generateImage', 'getGridSize', 'getMarkerLength', 'getMarkerSeparation'], 'GridBoard': ['create','generateImage', 'getGridSize', 'getMarkerLength', 'getMarkerSeparation'],
'CharucoBoard': ['create', 'generateImage', 'getChessboardCorners', 'getNearestMarkerCorners', 'checkCharucoCornersCollinear'] 'CharucoBoard': ['create', 'generateImage', 'getChessboardCorners', 'getNearestMarkerCorners', 'checkCharucoCornersCollinear']
} }

@ -49,6 +49,8 @@ except AttributeError:
print("AKAZE not available") print("AKAZE not available")
SEAM_FIND_CHOICES = OrderedDict() SEAM_FIND_CHOICES = OrderedDict()
SEAM_FIND_CHOICES['gc_color'] = cv.detail_GraphCutSeamFinder('COST_COLOR')
SEAM_FIND_CHOICES['gc_colorgrad'] = cv.detail_GraphCutSeamFinder('COST_COLOR_GRAD')
SEAM_FIND_CHOICES['dp_color'] = cv.detail_DpSeamFinder('COLOR') SEAM_FIND_CHOICES['dp_color'] = cv.detail_DpSeamFinder('COLOR')
SEAM_FIND_CHOICES['dp_colorgrad'] = cv.detail_DpSeamFinder('COLOR_GRAD') SEAM_FIND_CHOICES['dp_colorgrad'] = cv.detail_DpSeamFinder('COLOR_GRAD')
SEAM_FIND_CHOICES['voronoi'] = cv.detail.SeamFinder_createDefault(cv.detail.SeamFinder_VORONOI_SEAM) SEAM_FIND_CHOICES['voronoi'] = cv.detail.SeamFinder_createDefault(cv.detail.SeamFinder_VORONOI_SEAM)

Loading…
Cancel
Save