Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/15974/head
Alexander Alekhin 5 years ago
commit ad0ab4109a
  1. 123
      modules/core/include/opencv2/core/hal/intrin_avx.hpp
  2. 29
      modules/core/include/opencv2/core/hal/intrin_avx512.hpp
  3. 41
      modules/core/include/opencv2/core/hal/intrin_cpp.hpp
  4. 22
      modules/core/include/opencv2/core/hal/intrin_msa.hpp
  5. 32
      modules/core/include/opencv2/core/hal/intrin_neon.hpp
  6. 102
      modules/core/include/opencv2/core/hal/intrin_sse.hpp
  7. 15
      modules/core/include/opencv2/core/hal/intrin_sse_em.hpp
  8. 90
      modules/core/include/opencv2/core/hal/intrin_vsx.hpp
  9. 23
      modules/core/include/opencv2/core/hal/intrin_wasm.hpp
  10. 2
      modules/core/src/kmeans.cpp
  11. 2
      modules/core/src/logger.cpp
  12. 15
      modules/core/src/system.cpp
  13. 93
      modules/core/test/test_intrin_utils.hpp
  14. 28
      modules/core/test/test_math.cpp
  15. 6
      modules/dnn/src/op_inf_engine.cpp
  16. 6
      modules/imgproc/src/sumpixels.cpp
  17. 19
      modules/python/src2/cv2.cpp
  18. 29
      modules/videoio/src/cap_v4l.cpp
  19. 33
      samples/dnn/tf_text_graph_ssd.py

@ -90,6 +90,50 @@ inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
return _mm256_packus_epi32(am, bm);
}
// Extract the i-th 8-bit lane of a 256-bit integer vector as int (compile-time index).
template<int i>
inline int _v256_extract_epi8(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi8(a, i);
#else
// Fallback for toolchains without _mm256_extract_epi8 (e.g. MSVC < 2017):
// select the 128-bit half holding lane i (16 byte-lanes per half), then extract.
__m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
return _mm_extract_epi8(b, i & 15); // SSE4.1
#endif
}
// Extract the i-th 16-bit lane of a 256-bit integer vector as int (compile-time index).
// NOTE(review): the guard reuses the epi8 feature macro — presumably it stands in
// for availability of all _mm256_extract_* intrinsics on the toolchain; confirm.
template<int i>
inline int _v256_extract_epi16(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi16(a, i);
#else
// Fallback: pick the 128-bit half holding lane i (8 lanes per half), then extract.
__m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
return _mm_extract_epi16(b, i & 7); // SSE2
#endif
}
// Extract the i-th 32-bit lane of a 256-bit integer vector as int (compile-time index).
template<int i>
inline int _v256_extract_epi32(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi32(a, i);
#else
// Fallback: pick the 128-bit half holding lane i (4 lanes per half), then extract.
__m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
return _mm_extract_epi32(b, i & 3); // SSE4.1
#endif
}
// Extract the i-th 64-bit lane of a 256-bit integer vector as int64 (compile-time index).
template<int i>
inline int64 _v256_extract_epi64(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi64(a, i);
#else
// Fallback: pick the 128-bit half holding lane i (2 lanes per half), then extract.
__m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
return _mm_extract_epi64(b, i & 1); // SSE4.1
#endif
}
///////// Types ////////////
struct v_uint8x32
@ -2195,6 +2239,85 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
// v_extract_n<i>: return lane i of an AVX vector as a scalar (compile-time index).
// Unsigned integer types call the _v256_extract_* helpers directly; signed types
// are routed through their unsigned counterparts via reinterpretation.
template<int i>
inline uchar v_extract_n(v_uint8x32 a)
{
return (uchar)_v256_extract_epi8<i>(a.val);
}
template<int i>
inline schar v_extract_n(v_int8x32 a)
{
return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
}
template<int i>
inline ushort v_extract_n(v_uint16x16 a)
{
return (ushort)_v256_extract_epi16<i>(a.val);
}
template<int i>
inline short v_extract_n(v_int16x16 a)
{
return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
}
template<int i>
inline uint v_extract_n(v_uint32x8 a)
{
return (uint)_v256_extract_epi32<i>(a.val);
}
template<int i>
inline int v_extract_n(v_int32x8 a)
{
return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
}
template<int i>
inline uint64 v_extract_n(v_uint64x4 a)
{
return (uint64)_v256_extract_epi64<i>(a.val);
}
template<int i>
inline int64 v_extract_n(v_int64x4 v)
{
return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
}
// Floating-point lanes: extract the raw bits through the unsigned overload,
// then reinterpret them via a union (avoids pointer-based type punning).
template<int i>
inline float v_extract_n(v_float32x8 v)
{
union { uint iv; float fv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
return d.fv;
}
template<int i>
inline double v_extract_n(v_float64x4 v)
{
union { uint64 iv; double dv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
return d.dv;
}
// v_broadcast_element<i>: replicate lane i across all lanes (32-bit element
// types only) using a cross-lane permute.
template<int i>
inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
{
// Per-instantiation permutation mask selecting source lane i for every output lane.
// NOTE(review): the (char)i cast looks redundant for valid i in [0, 7] — confirm.
static const __m256i perm = _mm256_set1_epi32((char)i);
return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}
// Signed/float variants reuse the unsigned implementation via reinterpretation.
template<int i>
inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
template<int i>
inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
///////////////////// load deinterleave /////////////////////////////

@ -2228,6 +2228,35 @@ OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
// v_extract_n<i> for AVX-512 vectors: rotate lane i into position 0 and read
// it back with get0().
#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
// v_broadcast_element<i>: replicate 32-bit lane i across the whole register
// via a cross-lane permute; signed/float reuse the unsigned implementation.
template<int i>
inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
{
// NOTE(review): (char)i cast looks redundant for valid i in [0, 15] — confirm.
static const __m512i perm = _mm512_set1_epi32((char)i);
return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}
template<int i>
inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
template<int i>
inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
///////////////////// load deinterleave /////////////////////////////

@ -217,6 +217,8 @@ Regular integers:
|cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x |
|reverse | x | x | x | x | x | x |
|extract_n | x | x | x | x | x | x |
|broadcast_element | | | | | x | x |
Big integers:
@ -230,6 +232,7 @@ Big integers:
|extract | x | x |
|rotate (lanes) | x | x |
|cvt_flt64 | | x |
|extract_n | x | x |
Floating point:
@ -254,6 +257,8 @@ Floating point:
|extract | x | x |
|rotate (lanes) | x | x |
|reverse | x | x |
|extract_n | x | x |
|broadcast_element | x | |
@{ */
@ -1784,6 +1789,42 @@ inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
return r;
}
/** @brief Extract the s-th lane of a vector as a scalar

Returns the s-th element of v.

Restriction: 0 <= s < nlanes

Usage:
@code
v_int32x4 a;
int r;
r = v_extract_n<2>(a);
@endcode
For all types. */
template<int s, typename _Tp, int n>
inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
{
CV_DbgAssert(s >= 0 && s < n); // index is validated in debug builds only
return v.s[s];
}
/** @brief Broadcast i-th element of vector

Scheme:
@code
{ v[0] v[1] v[2] ... v[SZ] } => { v[i], v[i], v[i] ... v[i] }
@endcode
Restriction: 0 <= i < nlanes
Supported types: 32-bit integers and floats (s32/u32/f32)
*/
template<int i, typename _Tp, int n>
inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
{
CV_DbgAssert(i >= 0 && i < n); // index is validated in debug builds only
return v_reg<_Tp, n>::all(a.s[i]);
}
/** @brief Round
Rounds each value. Input type is float vector ==> output type is int vector.*/

@ -1783,6 +1783,28 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0))));
}
// Generic lane extraction for MSA vectors: rotate lane i into position 0,
// then read it back with get0().
template<int i, typename _Tp>
inline typename _Tp::lane_type v_extract_n(const _Tp& a)
{
return v_rotate_right<i>(a).get0();
}
// v_broadcast_element<i>: extract lane i, then splat it across all lanes
// (32-bit element types only).
template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{
return v_setall_u32(v_extract_n<i>(a));
}
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{
return v_setall_s32(v_extract_n<i>(a));
}
template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{
return v_setall_f32(v_extract_n<i>(a));
}
////// FP16 suport ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)

@ -1651,6 +1651,38 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
// v_extract_n<i>: direct NEON lane read via vgetq_lane_* (compile-time index).
#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
#endif
// v_broadcast_element<i>: extract lane i, then splat it with v_setall_*.
#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
#endif
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{

@ -57,6 +57,14 @@ namespace cv
//! @cond IGNORED
//
// Compilation troubleshooting:
// - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
// Replace parameter declaration to const reference:
// -v_int32x4 a
// +const v_int32x4& a
//
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
///////// Types ////////////
@ -3270,6 +3278,100 @@ inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
// v_extract_n<i>: return lane i of an SSE vector as a scalar (compile-time index).
// 8/32-bit extracts need SSE4.1 and fall back to rotate+get0 otherwise;
// 16-bit extract exists since SSE2, so it is unconditional.
template<int i>
inline uchar v_extract_n(const v_uint8x16& v)
{
#if CV_SSE4_1
return (uchar)_mm_extract_epi8(v.val, i);
#else
return v_rotate_right<i>(v).get0();
#endif
}
template<int i>
inline schar v_extract_n(const v_int8x16& v)
{
return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
}
template<int i>
inline ushort v_extract_n(const v_uint16x8& v)
{
return (ushort)_mm_extract_epi16(v.val, i);
}
template<int i>
inline short v_extract_n(const v_int16x8& v)
{
return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
}
template<int i>
inline uint v_extract_n(const v_uint32x4& v)
{
#if CV_SSE4_1
return (uint)_mm_extract_epi32(v.val, i);
#else
return v_rotate_right<i>(v).get0();
#endif
}
template<int i>
inline int v_extract_n(const v_int32x4& v)
{
return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
}
template<int i>
inline uint64 v_extract_n(const v_uint64x2& v)
{
// CV__SIMD_NATIVE_mm_extract_epi64 is defined by intrin_sse_em.hpp when the
// native 64-bit extract is usable on this toolchain/target.
#ifdef CV__SIMD_NATIVE_mm_extract_epi64
return (uint64)_v128_extract_epi64<i>(v.val);
#else
return v_rotate_right<i>(v).get0();
#endif
}
template<int i>
inline int64 v_extract_n(const v_int64x2& v)
{
return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
}
// Floating-point lanes: extract raw bits via the unsigned overload, then
// reinterpret through a union (avoids pointer-based type punning).
template<int i>
inline float v_extract_n(const v_float32x4& v)
{
union { uint iv; float fv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
return d.fv;
}
template<int i>
inline double v_extract_n(const v_float64x2& v)
{
union { uint64 iv; double dv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
return d.dv;
}
// v_broadcast_element<i>: replicate 32-bit lane i across all lanes with a shuffle.
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& v)
{
return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
}
template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
{
return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
}
template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& v)
{
// NOTE(review): the (char)i casts look redundant for valid i in [0, 3] — confirm.
return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
}
////////////// FP16 support ///////////////////////////
inline v_float32x4 v_load_expand(const float16_t* ptr)

@ -158,10 +158,23 @@ inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
#endif
}
// Extract the i-th 64-bit lane of a 128-bit vector (compile-time index).
// Side effect: defines CV__SIMD_NATIVE_mm_extract_epi64 when the native
// intrinsic is usable (SSE4.1 on 64-bit GCC/MSVC targets); intrin_sse.hpp
// keys off that macro in v_extract_n(v_uint64x2).
template<int i>
inline int64 _v128_extract_epi64(const __m128i& a)
{
#if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/))
#define CV__SIMD_NATIVE_mm_extract_epi64 1
return _mm_extract_epi64(a, i);
#else
// Fallback: spill both lanes to an aligned buffer and index it.
CV_DECL_ALIGNED(16) int64 tmp[2];
_mm_store_si128((__m128i*)tmp, a);
return tmp[i];
#endif
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
} // cv::
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP

@ -206,6 +206,20 @@ struct v_float64x2
{ return vec_extract(val, 0); }
};
// v_extract_n<i>: direct VSX lane read via vec_extract (compile-time index).
// VSX_UNUSED wraps the parameter to silence unused-parameter warnings on
// some compilers.
#define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); }
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
//////////////// Load and store operations ///////////////
/*
@ -1524,6 +1538,82 @@ OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
// v_broadcast_element<i>: replicate lane i across all lanes using vec_perm.
// For element widths > 8 bits, a byte-index permutation vector is built that
// repeats the byte offsets of element i for every output element; 8-bit types
// use the simpler vec_splats shortcut.
template<int i>
inline v_int8x16 v_broadcast_element(v_int8x16 v)
{
return v_int8x16(vec_perm(v.val, v.val, vec_splats((unsigned char)i)));
}
template<int i>
inline v_uint8x16 v_broadcast_element(v_uint8x16 v)
{
return v_uint8x16(vec_perm(v.val, v.val, vec_splats((unsigned char)i)));
}
template<int i>
inline v_int16x8 v_broadcast_element(v_int16x8 v)
{
// Byte offsets of 16-bit element i, repeated for all 8 output elements.
unsigned char t0 = 2*i, t1 = 2*i + 1;
vec_uchar16 p = {t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1};
return v_int16x8(vec_perm(v.val, v.val, p));
}
template<int i>
inline v_uint16x8 v_broadcast_element(v_uint16x8 v)
{
unsigned char t0 = 2*i, t1 = 2*i + 1;
vec_uchar16 p = {t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1, t0, t1};
return v_uint16x8(vec_perm(v.val, v.val, p));
}
template<int i>
inline v_int32x4 v_broadcast_element(v_int32x4 v)
{
// Byte offsets of 32-bit element i, repeated for all 4 output elements.
unsigned char t0 = 4*i, t1 = 4*i + 1, t2 = 4*i + 2, t3 = 4*i + 3;
vec_uchar16 p = {t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3};
return v_int32x4(vec_perm(v.val, v.val, p));
}
template<int i>
inline v_uint32x4 v_broadcast_element(v_uint32x4 v)
{
unsigned char t0 = 4*i, t1 = 4*i + 1, t2 = 4*i + 2, t3 = 4*i + 3;
vec_uchar16 p = {t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3};
return v_uint32x4(vec_perm(v.val, v.val, p));
}
template<int i>
inline v_int64x2 v_broadcast_element(v_int64x2 v)
{
// Byte offsets of 64-bit element i, repeated for both output elements.
unsigned char t0 = 8*i, t1 = 8*i + 1, t2 = 8*i + 2, t3 = 8*i + 3, t4 = 8*i + 4, t5 = 8*i + 5, t6 = 8*i + 6, t7 = 8*i + 7;
vec_uchar16 p = {t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7};
return v_int64x2(vec_perm(v.val, v.val, p));
}
template<int i>
inline v_uint64x2 v_broadcast_element(v_uint64x2 v)
{
unsigned char t0 = 8*i, t1 = 8*i + 1, t2 = 8*i + 2, t3 = 8*i + 3, t4 = 8*i + 4, t5 = 8*i + 5, t6 = 8*i + 6, t7 = 8*i + 7;
vec_uchar16 p = {t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7};
return v_uint64x2(vec_perm(v.val, v.val, p));
}
template<int i>
inline v_float32x4 v_broadcast_element(v_float32x4 v)
{
unsigned char t0 = 4*i, t1 = 4*i + 1, t2 = 4*i + 2, t3 = 4*i + 3;
vec_uchar16 p = {t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3, t0, t1, t2, t3};
return v_float32x4(vec_perm(v.val, v.val, p));
}
template<int i>
inline v_float64x2 v_broadcast_element(v_float64x2 v)
{
unsigned char t0 = 8*i, t1 = 8*i + 1, t2 = 8*i + 2, t3 = 8*i + 3, t4 = 8*i + 4, t5 = 8*i + 5, t6 = 8*i + 6, t7 = 8*i + 7;
vec_uchar16 p = {t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7};
return v_float64x2(vec_perm(v.val, v.val, p));
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

@ -4213,6 +4213,29 @@ inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
// Generic lane extraction for WASM vectors: rotate lane i into position 0,
// then read it back with get0().
template<int i, typename _Tp>
inline typename _Tp::lane_type v_extract_n(const _Tp& a)
{
return v_rotate_right<i>(a).get0();
}
// v_broadcast_element<i>: extract lane i, then splat it across all lanes
// (32-bit element types only).
template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{
return v_setall_u32(v_extract_n<i>(a));
}
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{
return v_setall_s32(v_extract_n<i>(a));
}
template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{
return v_setall_f32(v_extract_n<i>(a));
}
////////////// FP16 support ///////////////////////////
inline v_float32x4 v_load_expand(const float16_t* ptr)

@ -143,6 +143,8 @@ static void generateCentersPP(const Mat& data, Mat& _out_centers,
std::swap(tdist, tdist2);
}
}
if (bestCenter < 0)
CV_Error(Error::StsNoConv, "kmeans: can't update cluster center (check input for huge or NaN values)");
centers[k] = bestCenter;
sum0 = bestSum;
std::swap(dist, tdist);

@ -45,6 +45,8 @@ public:
// Constructs the global logging state. getInitializationMutex() is invoked
// first so that core global synchronization objects exist before the logging
// configuration string is applied.
GlobalLoggingInitStruct()
: logTagManager(m_defaultUnconfiguredGlobalLevel)
{
(void)getInitializationMutex(); // ensure initialization of global objects
applyConfigString();
handleMalformed();
}

@ -55,11 +55,26 @@
namespace cv {
// One-shot process-level initialization hook; currently only works around a
// missing iostream static initializer on Android/clang toolchains.
static void _initSystem()
{
#ifdef __ANDROID__
// https://github.com/opencv/opencv/issues/14906
// "ios_base::Init" object is not a part of Android's "iostream" header (in case of clang toolchain, NDK 20).
// Ref1: https://en.cppreference.com/w/cpp/io/ios_base/Init
// The header <iostream> behaves as if it defines (directly or indirectly) an instance of std::ios_base::Init with static storage duration
// Ref2: https://github.com/gcc-mirror/gcc/blob/gcc-8-branch/libstdc%2B%2B-v3/include/std/iostream#L73-L74
static std::ios_base::Init s_iostream_initializer;
#endif
}
// Global mutex guarding lazy initialization elsewhere in core; never deleted.
static Mutex* __initialization_mutex = NULL;
// Returns the global initialization mutex, creating it on first use.
// NOTE(review): the check-then-create here is not itself thread-safe; it
// appears to rely on being called early in a single-threaded context (see the
// "force initialization" comment that follows this function) — confirm.
Mutex& getInitializationMutex()
{
if (__initialization_mutex == NULL)
{
(void)_initSystem(); // run process-level init (iostreams workaround) first
__initialization_mutex = new Mutex();
}
return *__initialization_mutex;
}
// force initialization (single-threaded environment)

@ -134,17 +134,21 @@ template <typename R> struct Data
}
// Read-only lane access with bounds checking.
const LaneType & operator[](int i) const
{
#if 0 // TODO: strange bug - AVX2 tests are failed with this
CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
#else
CV_Assert(i >= 0 && i < R::nlanes);
#endif
return d[i];
}
// Mutable lane access with bounds checking (CV_Check* variant is fine here;
// only the const overload triggers the AVX2 issue noted above).
LaneType & operator[](int i)
{
CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
return d[i];
}
int_type as_int(int i) const
{
CV_Assert(i >= 0 && i < R::nlanes);
CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, "");
union
{
LaneType l;
@ -1190,6 +1194,40 @@ template<typename R> struct TheTest
return *this;
}
// Verifies v_extract_n<s>: plant a distinctive value in lane s of the input
// data, load it into a vector, and check the extracted scalar matches.
template<int s>
TheTest & test_extract_n()
{
SCOPED_TRACE(s);
Data<R> dataA;
LaneType test_value = (LaneType)(s + 50); // distinct from Data's default fill
dataA[s] = test_value;
R a = dataA;
LaneType res = v_extract_n<s>(a);
EXPECT_EQ(test_value, res);
return *this;
}
// Verifies v_broadcast_element<s>: plant a distinctive value in lane s, then
// check that every lane of the broadcast result equals it.
template<int s>
TheTest & test_broadcast_element()
{
SCOPED_TRACE(s);
Data<R> dataA;
LaneType test_value = (LaneType)(s + 50); // distinct from Data's default fill
dataA[s] = test_value;
R a = dataA;
Data<R> res = v_broadcast_element<s>(a);
for (int i = 0; i < R::nlanes; ++i)
{
SCOPED_TRACE(i);
EXPECT_EQ(test_value, res[i]);
}
return *this;
}
TheTest & test_float_math()
{
typedef typename V_RegTraits<R>::round_reg Ri;
@ -1498,6 +1536,7 @@ template<typename R> struct TheTest
void test_hal_intrin_uint8()
{
DUMP_ENTRY(v_uint8);
typedef v_uint8 R;
TheTest<v_uint8>()
.test_loadstore()
.test_interleave()
@ -1522,21 +1561,21 @@ void test_hal_intrin_uint8()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
//.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
#if CV_SIMD_WIDTH == 32
TheTest<v_uint8>()
.test_pack<9>().test_pack<10>().test_pack<13>().test_pack<15>()
.test_pack_u<9>().test_pack_u<10>().test_pack_u<13>().test_pack_u<15>()
.test_extract<16>().test_extract<17>().test_extract<23>().test_extract<31>()
.test_rotate<16>().test_rotate<17>().test_rotate<23>().test_rotate<31>()
;
#endif
;
}
void test_hal_intrin_int8()
{
DUMP_ENTRY(v_int8);
typedef v_int8 R;
TheTest<v_int8>()
.test_loadstore()
.test_interleave()
@ -1561,6 +1600,8 @@ void test_hal_intrin_int8()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
//.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
;
}
@ -1569,6 +1610,7 @@ void test_hal_intrin_int8()
void test_hal_intrin_uint16()
{
DUMP_ENTRY(v_uint16);
typedef v_uint16 R;
TheTest<v_uint16>()
.test_loadstore()
.test_interleave()
@ -1594,12 +1636,15 @@ void test_hal_intrin_uint16()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
//.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
;
}
void test_hal_intrin_int16()
{
DUMP_ENTRY(v_int16);
typedef v_int16 R;
TheTest<v_int16>()
.test_loadstore()
.test_interleave()
@ -1627,6 +1672,8 @@ void test_hal_intrin_int16()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
//.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
;
}
@ -1635,6 +1682,7 @@ void test_hal_intrin_int16()
void test_hal_intrin_uint32()
{
DUMP_ENTRY(v_uint32);
typedef v_uint32 R;
TheTest<v_uint32>()
.test_loadstore()
.test_interleave()
@ -1657,6 +1705,8 @@ void test_hal_intrin_uint32()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
.test_transpose()
;
}
@ -1664,6 +1714,7 @@ void test_hal_intrin_uint32()
void test_hal_intrin_int32()
{
DUMP_ENTRY(v_int32);
typedef v_int32 R;
TheTest<v_int32>()
.test_loadstore()
.test_interleave()
@ -1687,6 +1738,8 @@ void test_hal_intrin_int32()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
.test_float_cvt32()
.test_float_cvt64()
.test_transpose()
@ -1698,6 +1751,7 @@ void test_hal_intrin_int32()
void test_hal_intrin_uint64()
{
DUMP_ENTRY(v_uint64);
typedef v_uint64 R;
TheTest<v_uint64>()
.test_loadstore()
.test_addsub()
@ -1709,12 +1763,15 @@ void test_hal_intrin_uint64()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
//.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
;
}
void test_hal_intrin_int64()
{
DUMP_ENTRY(v_int64);
typedef v_int64 R;
TheTest<v_int64>()
.test_loadstore()
.test_addsub()
@ -1726,6 +1783,8 @@ void test_hal_intrin_int64()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
//.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
.test_cvt64_double()
;
}
@ -1734,6 +1793,7 @@ void test_hal_intrin_int64()
void test_hal_intrin_float32()
{
DUMP_ENTRY(v_float32);
typedef v_float32 R;
TheTest<v_float32>()
.test_loadstore()
.test_interleave()
@ -1757,20 +1817,20 @@ void test_hal_intrin_float32()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
;
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
#if CV_SIMD_WIDTH == 32
TheTest<v_float32>()
.test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
.test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
;
#endif
;
}
void test_hal_intrin_float64()
{
DUMP_ENTRY(v_float64);
#if CV_SIMD_64F
typedef v_float64 R;
TheTest<v_float64>()
.test_loadstore()
.test_addsub()
@ -1787,14 +1847,13 @@ void test_hal_intrin_float64()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
.test_extract_n<0>().test_extract_n<1>().test_extract_n<R::nlanes - 1>()
//.test_broadcast_element<0>().test_broadcast_element<1>().test_broadcast_element<R::nlanes - 1>()
#if CV_SIMD_WIDTH == 32
TheTest<v_float64>()
.test_extract<2>().test_extract<3>()
.test_rotate<2>().test_rotate<3>()
#endif
;
#endif //CV_SIMD256
#endif
}
@ -1804,14 +1863,14 @@ void test_hal_intrin_float16()
{
DUMP_ENTRY(v_float16);
#if CV_FP16
TheTest<v_float32>().test_loadstore_fp16_f32();
TheTest<v_float32>()
.test_loadstore_fp16_f32()
#endif
#if CV_SIMD_FP16
TheTest<v_float16>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;
#endif
;
}
#endif

@ -2949,6 +2949,34 @@ TEST(Core_KMeans, compactness)
}
}
// Regression test: kmeans with KMEANS_PP_CENTERS must fail with an exception
// (rather than hang or return garbage) when the input contains huge or NaN
// values that break the center-selection step.
TEST(Core_KMeans, bad_input)
{
const int N = 100;
const int attempts = 4;
const TermCriteria crit = TermCriteria(TermCriteria::COUNT, 5, 0); // low number of iterations
const int K = 3;
Mat data(N, 1, CV_32FC2);
cv::randu(data, Scalar(-200, -200), Scalar(200, 200));
// Each case overwrites the same element, so the cases are order-dependent by design.
{
SCOPED_TRACE("Huge value");
data.at<Vec2f>(10, 0) = Vec2f(1e20f, 0);
Mat labels, centers;
EXPECT_ANY_THROW(kmeans(data, K, labels, crit, attempts, KMEANS_PP_CENTERS, centers));
}
{
SCOPED_TRACE("Negative value");
data.at<Vec2f>(10, 0) = Vec2f(0, -1e20f);
Mat labels, centers;
EXPECT_ANY_THROW(kmeans(data, K, labels, crit, attempts, KMEANS_PP_CENTERS, centers));
}
{
SCOPED_TRACE("NaN");
data.at<Vec2f>(10, 0) = Vec2f(0, std::numeric_limits<float>::quiet_NaN());
Mat labels, centers;
EXPECT_ANY_THROW(kmeans(data, K, labels, crit, attempts, KMEANS_PP_CENTERS, centers));
}
}
TEST(CovariationMatrixVectorOfMat, accuracy)
{
unsigned int col_problem_size = 8, row_problem_size = 8, vector_size = 16;

@ -647,8 +647,6 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
enginePtr->AddExtension(extension, 0);
#else
ie.AddExtension(extension, "CPU");
// OpenCV fallbacks as extensions.
ie.AddExtension(std::make_shared<InfEngineExtension>(), "CPU");
#endif
CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << libName);
found = true;
@ -661,6 +659,10 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
CV_LOG_WARNING(NULL, "DNN-IE: Can't load extension plugin (extra layers for some networks). Specify path via OPENCV_DNN_IE_EXTRA_PLUGIN_PATH parameter");
}
// Some of networks can work without a library of extra layers.
#if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2019R1)
// OpenCV fallbacks as extensions.
ie.AddExtension(std::make_shared<InfEngineExtension>(), "CPU");
#endif
#ifndef _WIN32
// Limit the number of CPU threads.
#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)

@ -147,7 +147,8 @@ struct Integral_SIMD<uchar, int, double>
v_expand(el8, el4l, el4h);
el4l += prev;
el4h += el4l;
prev = vx_setall_s32(v_rotate_right<v_int32::nlanes - 1>(el4h).get0());
prev = v_broadcast_element<v_int32::nlanes - 1>(el4h);
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
@ -215,7 +216,8 @@ struct Integral_SIMD<uchar, float, double>
v_expand(el8, el4li, el4hi);
el4l = v_cvt_f32(el4li) + prev;
el4h = v_cvt_f32(el4hi) + el4l;
prev = vx_setall_f32(v_rotate_right<v_float32::nlanes - 1>(el4h).get0());
prev = v_broadcast_element<v_float32::nlanes - 1>(el4h);
#endif
v_store(sum_row + j , el4l + vx_load(prev_sum_row + j ));
v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));

@ -137,6 +137,17 @@ private:
PyGILState_STATE _state;
};
// Translate a cv::Exception into the Python 'cv2.error' exception: copy the
// exception's fields onto the error object as attributes, then set the
// Python error indicator with the formatted message.
// NOTE(review): PyString_FromString/PyInt_FromLong return new references and
// PyObject_SetAttrString does not steal them, so each raised exception appears
// to leak one reference per attribute — confirm against the Python C API docs.
static void pyRaiseCVException(const cv::Exception &e)
{
PyObject_SetAttrString(opencv_error, "file", PyString_FromString(e.file.c_str()));
PyObject_SetAttrString(opencv_error, "func", PyString_FromString(e.func.c_str()));
PyObject_SetAttrString(opencv_error, "line", PyInt_FromLong(e.line));
PyObject_SetAttrString(opencv_error, "code", PyInt_FromLong(e.code));
PyObject_SetAttrString(opencv_error, "msg", PyString_FromString(e.msg.c_str()));
PyObject_SetAttrString(opencv_error, "err", PyString_FromString(e.err.c_str()));
PyErr_SetString(opencv_error, e.what());
}
#define ERRWRAP2(expr) \
try \
{ \
@ -145,13 +156,7 @@ try \
} \
catch (const cv::Exception &e) \
{ \
PyObject_SetAttrString(opencv_error, "file", PyString_FromString(e.file.c_str())); \
PyObject_SetAttrString(opencv_error, "func", PyString_FromString(e.func.c_str())); \
PyObject_SetAttrString(opencv_error, "line", PyInt_FromLong(e.line)); \
PyObject_SetAttrString(opencv_error, "code", PyInt_FromLong(e.code)); \
PyObject_SetAttrString(opencv_error, "msg", PyString_FromString(e.msg.c_str())); \
PyObject_SetAttrString(opencv_error, "err", PyString_FromString(e.err.c_str())); \
PyErr_SetString(opencv_error, e.what()); \
pyRaiseCVException(e); \
return 0; \
}

@ -359,7 +359,7 @@ struct CvCaptureCAM_V4L CV_FINAL : public CvCapture
bool initCapture();
bool streaming(bool startStream);
bool setFps(int value);
bool tryIoctl(unsigned long ioctlCode, void *parameter) const;
bool tryIoctl(unsigned long ioctlCode, void *parameter, bool failIfBusy = true, int attempts = 10) const;
bool controlInfo(int property_id, __u32 &v4l2id, cv::Range &range) const;
bool icvControl(__u32 v4l2id, int &value, bool isSet) const;
@ -415,10 +415,10 @@ bool CvCaptureCAM_V4L::try_palette_v4l2()
form.fmt.pix.field = V4L2_FIELD_ANY;
form.fmt.pix.width = width;
form.fmt.pix.height = height;
if (!tryIoctl(VIDIOC_S_FMT, &form))
{
return false;
}
return palette == form.fmt.pix.pixelformat;
}
@ -490,6 +490,8 @@ bool CvCaptureCAM_V4L::autosetup_capture_mode_v4l2()
//in case palette is already set and works, no need to setup.
if (palette != 0 && try_palette_v4l2()) {
return true;
} else if (errno == EBUSY) {
return false;
}
__u32 try_order[] = {
V4L2_PIX_FMT_BGR24,
@ -646,7 +648,9 @@ bool CvCaptureCAM_V4L::initCapture()
}
if (!autosetup_capture_mode_v4l2()) {
fprintf(stderr, "VIDEOIO ERROR: V4L2: Pixel format of incoming image is unsupported by OpenCV\n");
if (errno != EBUSY) {
fprintf(stderr, "VIDEOIO ERROR: V4L2: Pixel format of incoming image is unsupported by OpenCV\n");
}
return false;
}
@ -866,11 +870,22 @@ bool CvCaptureCAM_V4L::read_frame_v4l2()
return true;
}
bool CvCaptureCAM_V4L::tryIoctl(unsigned long ioctlCode, void *parameter) const
bool CvCaptureCAM_V4L::tryIoctl(unsigned long ioctlCode, void *parameter, bool failIfBusy, int attempts) const
{
CV_LOG_DEBUG(NULL, "tryIoctl(handle=" << deviceHandle << ", ioctl=0x" << std::hex << ioctlCode << ", ...)")
CV_LOG_DEBUG(NULL, "tryIoctl(handle=" << deviceHandle << ", ioctl=0x" << std::hex << ioctlCode << ", ...)");
if (attempts == 0) {
return false;
}
while (-1 == ioctl(deviceHandle, ioctlCode, parameter)) {
if (!(errno == EBUSY || errno == EAGAIN))
const bool isBusy = (errno == EBUSY);
if (isBusy & failIfBusy) {
return false;
}
if ((attempts > 0) && (--attempts == 0)) {
return false;
}
if (!(isBusy || errno == EAGAIN))
return false;
fd_set fds;

@ -283,6 +283,9 @@ def createSSDGraph(modelPath, configPath, outputPath):
# Add layers that generate anchors (bounding boxes proposals).
priorBoxes = []
boxCoder = config['box_coder'][0]
fasterRcnnBoxCoder = boxCoder['faster_rcnn_box_coder'][0]
boxCoderVariance = [1.0/float(fasterRcnnBoxCoder['x_scale'][0]), 1.0/float(fasterRcnnBoxCoder['y_scale'][0]), 1.0/float(fasterRcnnBoxCoder['width_scale'][0]), 1.0/float(fasterRcnnBoxCoder['height_scale'][0])]
for i in range(num_layers):
priorBox = NodeDef()
priorBox.name = 'PriorBox_%d' % i
@ -303,7 +306,7 @@ def createSSDGraph(modelPath, configPath, outputPath):
priorBox.addAttr('width', widths)
priorBox.addAttr('height', heights)
priorBox.addAttr('variance', [0.1, 0.1, 0.2, 0.2])
priorBox.addAttr('variance', boxCoderVariance)
graph_def.node.extend([priorBox])
priorBoxes.append(priorBox.name)
@ -336,11 +339,31 @@ def createSSDGraph(modelPath, configPath, outputPath):
detectionOut.addAttr('num_classes', num_classes + 1)
detectionOut.addAttr('share_location', True)
detectionOut.addAttr('background_label_id', 0)
detectionOut.addAttr('nms_threshold', 0.6)
detectionOut.addAttr('top_k', 100)
postProcessing = config['post_processing'][0]
batchNMS = postProcessing['batch_non_max_suppression'][0]
if 'iou_threshold' in batchNMS:
detectionOut.addAttr('nms_threshold', float(batchNMS['iou_threshold'][0]))
else:
detectionOut.addAttr('nms_threshold', 0.6)
if 'score_threshold' in batchNMS:
detectionOut.addAttr('confidence_threshold', float(batchNMS['score_threshold'][0]))
else:
detectionOut.addAttr('confidence_threshold', 0.01)
if 'max_detections_per_class' in batchNMS:
detectionOut.addAttr('top_k', int(batchNMS['max_detections_per_class'][0]))
else:
detectionOut.addAttr('top_k', 100)
if 'max_total_detections' in batchNMS:
detectionOut.addAttr('keep_top_k', int(batchNMS['max_total_detections'][0]))
else:
detectionOut.addAttr('keep_top_k', 100)
detectionOut.addAttr('code_type', "CENTER_SIZE")
detectionOut.addAttr('keep_top_k', 100)
detectionOut.addAttr('confidence_threshold', 0.01)
graph_def.node.extend([detectionOut])

Loading…
Cancel
Save