Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/18042/head
Alexander Alekhin 4 years ago
commit fa25faa2d2
  1. 3rdparty/carotene/CMakeLists.txt (3)
  2. 3rdparty/carotene/hal/CMakeLists.txt (3)
  3. 3rdparty/carotene/hal/dummy.cpp (2)
  4. 3rdparty/carotene/src/dummy.cpp (2)
  5. cmake/OpenCVDetectCUDA.cmake (24)
  6. doc/js_tutorials/js_assets/opencv_logo.jpg (BIN)
  7. doc/opencv-logo-small.png (BIN)
  8. doc/opencv-logo-white.png (BIN)
  9. doc/opencv-logo.png (BIN)
  10. doc/opencv-logo2.png (BIN)
  11. doc/opencv.bib (10)
  12. doc/opencv.ico (BIN)
  13. doc/py_tutorials/py_setup/images/opencv_logo.jpg (BIN)
  14. modules/calib3d/src/calibration_handeye.cpp (10)
  15. modules/calib3d/test/test_calibration_hand_eye.cpp (5)
  16. modules/core/include/opencv2/core.hpp (4)
  17. modules/dnn/include/opencv2/dnn/all_layers.hpp (8)
  18. modules/dnn/src/darknet/darknet_io.cpp (24)
  19. modules/dnn/src/layers/convolution_layer.cpp (166)
  20. modules/dnn/src/layers/fully_connected_layer.cpp (2)
  21. modules/dnn/src/layers/permute_layer.cpp (3)
  22. modules/dnn/src/layers/slice_layer.cpp (129)
  23. modules/dnn/src/onnx/onnx_importer.cpp (9)
  24. modules/dnn/src/opencl/slice.cl (191)
  25. modules/dnn/test/test_darknet_importer.cpp (5)
  26. modules/dnn/test/test_layers.cpp (29)
  27. modules/dnn/test/test_misc.cpp (39)
  28. modules/dnn/test/test_onnx_importer.cpp (56)
  29. modules/features2d/include/opencv2/features2d.hpp (25)
  30. modules/features2d/src/affine_feature.cpp (358)
  31. modules/features2d/test/test_affine_feature.cpp (185)
  32. modules/flann/include/opencv2/flann.hpp (32)
  33. modules/flann/include/opencv2/flann/hierarchical_clustering_index.h (39)
  34. modules/flann/include/opencv2/flann/kmeans_index.h (195)
  35. modules/imgcodecs/src/grfmt_jpeg2000.cpp (4)
  36. modules/imgproc/include/opencv2/imgproc.hpp (2)
  37. modules/stitching/include/opencv2/stitching.hpp (2)
  38. modules/stitching/misc/python/test/test_stitching.py (33)
  39. samples/cpp/asift.cpp (199)
  40. samples/cpp/warpPerspective_demo.cpp (19)
  41. samples/data/opencv-logo-white.png (BIN)
  42. samples/data/opencv-logo.png (BIN)
  43. samples/winrt/ImageManipulations/assets/StoreLogo.png (BIN)
  44. samples/winrt/ImageManipulations/assets/opencv-logo-150.png (BIN)
  45. samples/winrt/ImageManipulations/assets/opencv-logo-30.png (BIN)
  46. samples/winrt/ImageManipulations/assets/windows-sdk.scale-100.png (BIN)
  47. samples/winrt/JavaScript/images/logo.scale-100.png (BIN)
  48. samples/winrt/JavaScript/images/smalllogo.scale-100.png (BIN)
  49. samples/winrt/JavaScript/images/windows-sdk.png (BIN)
  50. samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/Logo.png (BIN)
  51. samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/SmallLogo.png (BIN)
  52. samples/winrt/OcvImageProcessing/OcvImageProcessing/Assets/StoreLogo.png (BIN)

3rdparty/carotene/CMakeLists.txt
@@ -40,4 +40,5 @@ if(WITH_NEON)
  target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
endif()
-add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")
+# we add dummy file to fix XCode build
+add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>" "${CAROTENE_SOURCE_DIR}/dummy.cpp")

3rdparty/carotene/hal/CMakeLists.txt
@@ -82,7 +82,8 @@ set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
  # set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()
-add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
+# we add dummy file to fix XCode build
+add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs> "dummy.cpp")
set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")
if(NOT BUILD_SHARED_LIBS)

3rdparty/carotene/hal/dummy.cpp
@@ -0,0 +1,2 @@
+// This file is needed for compilation on some platforms e.g. with XCode generator
+// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457

3rdparty/carotene/src/dummy.cpp
@@ -0,0 +1,2 @@
+// This file is needed for compilation on some platforms e.g. with XCode generator
+// Related issue: https://gitlab.kitware.com/cmake/cmake/-/issues/17457

cmake/OpenCVDetectCUDA.cmake
@@ -88,7 +88,12 @@ if(CUDA_FOUND)
  message(STATUS "CUDA detected: " ${CUDA_VERSION})
-  set(_generations "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere")
+  OCV_OPTION(CUDA_ENABLE_DEPRECATED_GENERATION "Enable deprecated generations in the list" OFF)
+  set(_generations "Maxwell" "Pascal" "Volta" "Turing" "Ampere")
+  if(CUDA_ENABLE_DEPRECATED_GENERATION)
+    set(_generations "Fermi" "${_generations}")
+    set(_generations "Kepler" "${_generations}")
+  endif()
  set(_arch_fermi   "2.0")
  set(_arch_kepler  "3.0;3.5;3.7")
  set(_arch_maxwell "5.0;5.2")
@@ -209,10 +214,6 @@ if(CUDA_FOUND)
    endif()
  endmacro()
-  macro(ocv_wipeout_deprecated _arch_bin_list)
-    string(REPLACE "2.1" "2.1(2.0)" ${_arch_bin_list} "${${_arch_bin_list}}")
-  endmacro()
  set(__cuda_arch_ptx "")
  if(CUDA_GENERATION STREQUAL "Fermi")
    set(__cuda_arch_bin ${_arch_fermi})
@@ -275,7 +276,6 @@ if(CUDA_FOUND)
      )
    endif()
  endif()
-  ocv_wipeout_deprecated(__cuda_arch_bin)
  set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
  set(CUDA_ARCH_PTX ${__cuda_arch_ptx} CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
@@ -283,10 +283,14 @@ if(CUDA_FOUND)
  string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
  string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
-  # Check if user specified 1.0 compute capability: we don't support it
-  if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " 1.0")
-    message(SEND_ERROR "CUDA: 1.0 compute capability is not supported - exclude it from ARCH/PTX list are re-run CMake")
-  endif()
+  # Check if user specified 1.0/2.1 compute capability: we don't support it
+  macro(ocv_wipeout_deprecated_cc target_cc)
+    if(" ${CUDA_ARCH_BIN} ${CUDA_ARCH_PTX}" MATCHES " ${target_cc}")
+      message(SEND_ERROR "CUDA: ${target_cc} compute capability is not supported - exclude it from ARCH/PTX list and re-run CMake")
+    endif()
+  endmacro()
+  ocv_wipeout_deprecated_cc("1.0")
+  ocv_wipeout_deprecated_cc("2.1")
  # NVCC flags to be set
  set(NVCC_FLAGS_EXTRA "")

doc/js_tutorials/js_assets/opencv_logo.jpg (binary file not shown; 4.4 KiB before, 5.2 KiB after)
doc/opencv-logo-small.png (binary file not shown; 1.4 KiB before, 2.1 KiB after)
doc/opencv-logo-white.png (binary file not shown; 7.9 KiB before, 9.5 KiB after)
doc/opencv-logo.png (binary file not shown; 17 KiB before, 36 KiB after)
doc/opencv-logo2.png (binary file not shown; 24 KiB before, 42 KiB after)

doc/opencv.bib
@@ -584,6 +584,16 @@
  pages = {1033--1040},
  publisher = {IEEE}
}
+@article{YM11,
+  author = {Yu, Guoshen and Morel, Jean-Michel},
+  title = {ASIFT: An Algorithm for Fully Affine Invariant Comparison},
+  year = {2011},
+  pages = {11--38},
+  journal = {Image Processing On Line},
+  volume = {1},
+  doi = {10.5201/ipol.2011.my-asift},
+  url = {http://www.ipol.im/pub/algo/my_affine_sift/}
+}
@inproceedings{LCS11,
  author = {Leutenegger, Stefan and Chli, Margarita and Siegwart, Roland Yves},
  title = {BRISK: Binary robust invariant scalable keypoints},

doc/opencv.ico (binary file not shown; 4.7 KiB before, 4.2 KiB after)
doc/py_tutorials/py_setup/images/opencv_logo.jpg (binary file not shown; 4.4 KiB before, 5.2 KiB after)

modules/calib3d/src/calibration_handeye.cpp
@@ -712,7 +712,10 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr
    {
        Mat m = Mat::eye(4, 4, CV_64FC1);
        Mat R = m(Rect(0, 0, 3, 3));
-       R_gripper2base_[i].convertTo(R, CV_64F);
+       if (R_gripper2base_[i].size() == Size(3, 3))
+           R_gripper2base_[i].convertTo(R, CV_64F);
+       else
+           Rodrigues(R_gripper2base_[i], R);
        Mat t = m(Rect(3, 0, 1, 3));
        t_gripper2base_[i].convertTo(t, CV_64F);
@@ -727,7 +730,10 @@ void calibrateHandEye(InputArrayOfArrays R_gripper2base, InputArrayOfArrays t_gr
    {
        Mat m = Mat::eye(4, 4, CV_64FC1);
        Mat R = m(Rect(0, 0, 3, 3));
-       R_target2cam_[i].convertTo(R, CV_64F);
+       if (R_target2cam_[i].size() == Size(3, 3))
+           R_target2cam_[i].convertTo(R, CV_64F);
+       else
+           Rodrigues(R_target2cam_[i], R);
        Mat t = m(Rect(3, 0, 1, 3));
        t_target2cam_[i].convertTo(t, CV_64F);

modules/calib3d/test/test_calibration_hand_eye.cpp
@@ -317,7 +317,10 @@ void CV_CalibrateHandEyeTest::simulateData(RNG& rng, int nPoses,
            t_gripper2base_noise.at<double>(2,0) += rng.gaussian(0.001);
        }
-       R_target2cam.push_back(T_target2cam(Rect(0, 0, 3, 3)));
+       // test rvec represenation
+       Mat rvec_target2cam;
+       cv::Rodrigues(T_target2cam(Rect(0, 0, 3, 3)), rvec_target2cam);
+       R_target2cam.push_back(rvec_target2cam);
        t_target2cam.push_back(T_target2cam(Rect(3, 0, 1, 3)));
    }
}
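
The two calib3d hunks above make calibrateHandEye() accept each rotation either as a 3x3 matrix or as a Rodrigues vector. A minimal sketch of calling it under that relaxed contract (the pose vectors are assumed to be collected elsewhere; this snippet is not part of the patch):

// Each R_* entry may be a CV_64F 3x3 rotation matrix or a 3x1 Rodrigues vector;
// the patched loader converts rvecs internally via cv::Rodrigues().
#include <opencv2/calib3d.hpp>
#include <vector>

void solveHandEye(const std::vector<cv::Mat>& R_gripper2base,
                  const std::vector<cv::Mat>& t_gripper2base,
                  const std::vector<cv::Mat>& R_target2cam,
                  const std::vector<cv::Mat>& t_target2cam)
{
    cv::Mat R_cam2gripper, t_cam2gripper;
    cv::calibrateHandEye(R_gripper2base, t_gripper2base,
                         R_target2cam, t_target2cam,
                         R_cam2gripper, t_cam2gripper,
                         cv::CALIB_HAND_EYE_TSAI);
}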

modules/core/include/opencv2/core.hpp
@@ -1614,7 +1614,9 @@ elements.
CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0,
                            double minVal = -DBL_MAX, double maxVal = DBL_MAX);
-/** @brief converts NaN's to the given number
+/** @brief converts NaNs to the given number
+@param a input/output matrix (CV_32F type).
+@param val value to convert the NaNs
*/
CV_EXPORTS_W void patchNaNs(InputOutputArray a, double val = 0);
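
A minimal usage sketch for the patchNaNs() behaviour documented above (CV_32F input, NaNs replaced by val); this snippet is illustrative and not part of the patch:

#include <opencv2/core.hpp>
#include <limits>

int main()
{
    cv::Mat m = (cv::Mat_<float>(1, 3) << 1.f, std::numeric_limits<float>::quiet_NaN(), 3.f);
    cv::patchNaNs(m, 0);                 // the NaN entry becomes 0
    CV_Assert(m.at<float>(0, 1) == 0.f);
    return 0;
}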

modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -600,6 +600,14 @@ CV__DNN_INLINE_NS_BEGIN
        static Ptr<RegionLayer> create(const LayerParams& params);
    };
+   /**
+    * @brief Detection output layer.
+    *
+    * The layer size is: @f$ (1 \times 1 \times N \times 7) @f$
+    * where N is [keep_top_k] parameter multiplied by batch size. Each row is:
+    * [image_id, label, confidence, xmin, ymin, xmax, ymax]
+    * where image_id is the index of image input in the batch.
+    */
    class CV_EXPORTS DetectionOutputLayer : public Layer
    {
    public:
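
A minimal sketch (not part of the patch) of reading the 1 x 1 x N x 7 blob described by the new DetectionOutputLayer documentation. The network, input frame, the 0.5 confidence threshold and the assumption that box coordinates are normalized (as in typical SSD models) are all illustrative:

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>

void drawDetections(cv::dnn::Net& net, cv::Mat& frame)
{
    cv::Mat out = net.forward();                              // 1 x 1 x N x 7
    cv::Mat det(out.size[2], 7, CV_32F, out.ptr<float>());    // one detection per row
    for (int i = 0; i < det.rows; ++i)
    {
        float confidence = det.at<float>(i, 2);
        if (confidence < 0.5f)
            continue;
        // columns: image_id, label, confidence, xmin, ymin, xmax, ymax
        cv::Point p1(cvRound(det.at<float>(i, 3) * frame.cols), cvRound(det.at<float>(i, 4) * frame.rows));
        cv::Point p2(cvRound(det.at<float>(i, 5) * frame.cols), cvRound(det.at<float>(i, 6) * frame.rows));
        cv::rectangle(frame, p1, p2, cv::Scalar(0, 255, 0), 2);
    }
}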

modules/dnn/src/darknet/darknet_io.cpp
@@ -221,6 +221,10 @@ namespace cv {
            {
                cv::dnn::LayerParams activation_param;
                if (type == "relu")
+               {
+                   activation_param.type = "ReLU";
+               }
+               else if (type == "leaky")
                {
                    activation_param.set<float>("negative_slope", 0.1f);
                    activation_param.type = "ReLU";
@@ -862,24 +866,8 @@ namespace cv {
                }
                std::string activation = getParam<std::string>(layer_params, "activation", "linear");
-               if (activation == "leaky")
-               {
-                   setParams.setActivation("relu");
-               }
-               else if (activation == "swish")
-               {
-                   setParams.setActivation("swish");
-               }
-               else if (activation == "mish")
-               {
-                   setParams.setActivation("mish");
-               }
-               else if (activation == "logistic")
-               {
-                   setParams.setActivation("logistic");
-               }
-               else if (activation != "linear")
-                   CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation);
+               if (activation != "linear")
+                   setParams.setActivation(activation);
                net->out_channels_vec[layers_counter] = tensor_shape[0];
            }

modules/dnn/src/layers/convolution_layer.cpp
@@ -114,18 +114,19 @@ public:
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
-       CV_Assert(inputs.size() > 0);
-       CV_Assert(blobs.size() == 1 || blobs.size() == 2);
+       CV_Assert((inputs.size() > outputs.size() && blobs.empty()) ||
+                 (!inputs.empty() && (blobs.size() == 1 || blobs.size() == 2)));
+       MatSize weightShape = blobs.empty() ? inputs[1].size : blobs[0].size;
        CV_Assert(inputs[0].dims == outputs[0].dims);
-       CV_Assert(blobs[0].dims == kernel_size.size() + 2);
+       CV_Assert(weightShape.dims() == kernel_size.size() + 2);
        for (int i = 0; i < kernel_size.size(); i++) {
-           CV_Assert(blobs[0].size[i + 2] == kernel_size[i]);
+           CV_Assert(weightShape[i + 2] == kernel_size[i]);
        }
        const Mat &input = inputs[0];
        CV_Assert((input.dims == 4 || input.dims == 5) && (input.type() == CV_32F || input.type() == CV_16S));
-       for (size_t i = 0; i < inputs.size(); i++)
+       for (size_t i = 0; i < outputs.size(); i++)
        {
            CV_Assert(inputs[i].type() == input.type());
            CV_Assert((inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]);
@@ -270,6 +271,7 @@ public:
    MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
    {
+       CV_Assert(!blobs.empty());
        int dims = inpShape.size();
        int inpD = dims == 5 ? inpShape[2] : 1;
        int inpH = inpShape[dims - 2];
@@ -296,6 +298,8 @@ public:
        {
            if (kernel_size.size() == 3)
                return preferableTarget == DNN_TARGET_CPU;
+           if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || preferableTarget != DNN_TARGET_MYRIAD) && blobs.empty())
+               return false;
            return (preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height);
        }
        else
@@ -305,7 +309,7 @@ public:
            return (preferableTarget == DNN_TARGET_CPU && backendId == DNN_BACKEND_OPENCV);
        else if (kernel_size.size() == 2)
            return backendId == DNN_BACKEND_OPENCV ||
-                  backendId == DNN_BACKEND_HALIDE ||
+                  (backendId == DNN_BACKEND_HALIDE && !blobs.empty()) ||
                   (backendId == DNN_BACKEND_VKCOM && haveVulkan());
        else
            return false;
@@ -317,16 +321,16 @@ public:
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
-       CV_Assert(blobs.size() != 0);
-       CV_Assert(!hasBias() || blobs[1].total() == (size_t)blobs[0].size[0]);
-       CV_Assert(inputs.size() == (size_t)1);
+       CV_Assert(!blobs.empty() || inputs.size() > 1);
+       const int* weightShape = blobs.empty() ? &inputs[1][0] : blobs[0].size.p;
+       CV_Assert(!hasBias() || blobs[1].total() == (size_t)weightShape[0]);
        internals.clear();
        CV_Assert(inputs.size() != 0);
        std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
-       int outCn = blobs[0].size[0];
+       int outCn = weightShape[0];
        std::vector<int> outShape;
        outShape.push_back(inputs[0][0]);
        outShape.push_back(outCn);
@@ -342,10 +346,10 @@ public:
            getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape);
        }
-       int ngroups = inpCn / blobs[0].size[1];
-       if (ngroups == 0 || ngroups * blobs[0].size[1] != inpCn)
+       int ngroups = inpCn / weightShape[1];
+       if (ngroups == 0 || ngroups * weightShape[1] != inpCn)
            CV_Error(Error::StsError, format("Number of input channels should "
-                    "be multiple of %d but got %d", blobs[0].size[1], inpCn));
+                    "be multiple of %d but got %d", weightShape[1], inpCn));
        CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0);
        outputs.resize(1, outShape);
@@ -357,15 +361,15 @@ public:
    {
        BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr);
-       CV_Assert(!blobs.empty());
-       const int outCn = blobs[0].size[0];
+       std::vector<Mat> inputs;
+       inputs_arr.getMatVector(inputs);
        // prepare weightsMat where each row is aligned and has enough zero padding on the right to
        // use vectorized (i.e. with intrinsics) loops without tail processing
-       Mat wm = blobs[0].reshape(1, outCn);
+       Mat wm = blobs.empty() ? inputs[1].reshape(1, numOutput) : blobs[0].reshape(1, numOutput);
        if( wm.step1() % VEC_ALIGN != 0 )
        {
            int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
-           Mat wm_buffer = Mat(outCn, newcols, wm.type());
+           Mat wm_buffer = Mat(numOutput, newcols, wm.type());
            Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
            wm_padding.setTo(Scalar::all(0.));
            Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
@@ -373,18 +377,18 @@ public:
            wm = wm_aligned;
        }
        weightsMat = wm;
-       weightsMultipliers.assign(outCn, 1.0);
-       Mat biasMat = hasBias() ? blobs[1].reshape(1, outCn) : Mat();
-       biasvec.resize(outCn+2);
+       weightsMultipliers.assign(numOutput, 1.0);
+       Mat biasMat = hasBias() ? blobs[1].reshape(1, numOutput) : Mat();
+       biasvec.resize(numOutput+2);
        if( biasMat.empty() )
        {
-           for(int i = 0; i < outCn; i++ )
+           for(int i = 0; i < numOutput; i++ )
                biasvec[i] = 0.f;
        }
        else
        {
-           for(int i = 0; i < outCn; i++ )
+           for(int i = 0; i < numOutput; i++ )
                biasvec[i] = biasMat.at<float>(i);
        }
#ifdef HAVE_OPENCL
@@ -394,7 +398,7 @@ public:
    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
-       if (!activ.empty() && !layer.empty())
+       if ((!activ.empty() && !layer.empty()) || blobs.empty())
            return false;
        activ = layer;
@@ -743,37 +747,48 @@ public:
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
-       CV_Assert_N(inputs.size() == 1, nodes.size() == 1);
+       CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        std::vector<size_t> dims = ieInpNode->get_shape();
        CV_Assert(dims.size() == 4 || dims.size() == 5);
+       std::shared_ptr<ngraph::Node> ieWeights = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
        const int inpCn = dims[1];
-       const int outCn = blobs[0].size[0];
-       const int inpGroupCn = blobs[0].size[1];
+       const int inpGroupCn = nodes.size() > 1 ? ieWeights->get_shape()[1] : blobs[0].size[1];
        const int group = inpCn / inpGroupCn;
-       std::vector<size_t> kernel_shape = getShape<size_t>(blobs[0]);
+       std::vector<size_t> kernel_shape;
        if (group != 1)
        {
-           kernel_shape[0] /= group;
-           kernel_shape.insert(kernel_shape.begin(), group);
+           kernel_shape.push_back(group);
        }
+       kernel_shape.push_back(numOutput / group);
+       kernel_shape.push_back(inpCn / group);
+       std::copy(kernel_size.begin(), kernel_size.end(), back_inserter(kernel_shape));
-       auto ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, blobs[0].data);
-       if (fusedWeights)
+       if (nodes.size() == 1)
        {
-           if (weightsMat.isContinuous())
+           ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, blobs[0].data);
+           if (fusedWeights)
            {
-               ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, weightsMat.data);
-           }
-           else
-           {
-               Mat newWeights;
-               Mat cvWeights = weightsMat.colRange(0, blobs[0].total() / outCn);
-               cvWeights.copyTo(newWeights);
-               ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, newWeights.data);
+               if (weightsMat.isContinuous())
+               {
+                   ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, weightsMat.data);
+               }
+               else
+               {
+                   Mat newWeights;
+                   Mat cvWeights = weightsMat.colRange(0, blobs[0].total() / numOutput);
+                   cvWeights.copyTo(newWeights);
+                   ieWeights = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, kernel_shape, newWeights.data);
+               }
            }
        }
+       else
+       {
+           auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
+                                                               ngraph::Shape{kernel_shape.size()}, kernel_shape.data());
+           ieWeights = std::make_shared<ngraph::op::v1::Reshape>(ieWeights, shape, true);
+       }
        ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
        if (!padMode.empty())
@@ -798,11 +813,21 @@ public:
                                              pad_type);
        }
-       if (hasBias() || fusedBias)
+       if (hasBias() || fusedBias || nodes.size() == 3)
        {
            std::vector<size_t> shape(conv_node->get_shape().size(), 1);
-           shape[1] = outCn;
-           auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), biasvec.data());
+           shape[1] = conv_node->get_shape()[1];
+           std::shared_ptr<ngraph::Node> bias;
+           if (nodes.size() == 3)
+           {
+               auto bias_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
+                                                                        ngraph::Shape{shape.size()}, shape.data());
+               bias = std::make_shared<ngraph::op::v1::Reshape>(nodes[2].dynamicCast<InfEngineNgraphNode>()->node, bias_shape, true);
+           }
+           else
+           {
+               bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), biasvec.data());
+           }
            auto conv_bias = std::make_shared<ngraph::op::v1::Add>(conv_node, bias, ngraph::op::AutoBroadcastType::NUMPY);
            return Ptr<BackendNode>(new InfEngineNgraphNode(conv_bias));
        }
@@ -1516,6 +1541,26 @@ public:
        for (int i = 0; i < inputs.size(); ++i)
            CV_Assert(inputs[i].u != outputs[0].u);
+       if (blobs.empty())
+       {
+           size_t n = inputs.size() - 1;
+           umat_blobs.resize(n);
+           for (size_t i = 0; i < n; i++)
+           {
+               if (use_half)
+               {
+                   Mat matFP32;
+                   convertFp16(inputs[i + 1], matFP32);
+                   matFP32.copyTo(umat_blobs[i]);
+               }
+               else
+               {
+                   inputs[i + 1].copyTo(umat_blobs[i]);
+               }
+           }
+           inputs.resize(1);
+       }
        if (umat_blobs.empty())
        {
            size_t n = blobs.size();
@@ -1526,7 +1571,7 @@ public:
            }
        }
-       if (convolutionOp.empty())
+       if (convolutionOp.empty() || blobs.empty())
        {
            OCL4DNNConvConfig config;
            config.in_shape = shape(inputs[0]);
@@ -1536,7 +1581,7 @@ public:
            config.stride = stride;
            config.dilation = dilation;
            config.group = inputs[0].size[1] / umat_blobs[0].size[1];
-           config.bias_term = (hasBias()) ? true : false;
+           config.bias_term = umat_blobs.size() == 2;
            config.use_half = use_half;
            convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
@@ -1663,16 +1708,37 @@ public:
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
+       int outCn = blobs.empty() ? inputs[1].size[0] : blobs[0].size[0];
+       // Need to align non-const blobs
+       if (blobs.empty())
+       {
+           Mat wm = inputs[1].reshape(1, outCn);
+           if( wm.step1() % VEC_ALIGN != 0 )
+           {
+               wm.copyTo(weightsMat);
+               if (inputs.size() > 2)
+               {
+                   Mat biasMat = inputs[2].reshape(1, outCn);
+                   biasMat.col(0).copyTo(biasvec);
+                   biasvec.resize(outCn + 2);
+               }
+               else
+               {
+                   biasvec.resize(outCn + 2, 0);
+               }
+           }
+       }
        /*printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
               name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3],
               kernel.width, kernel.height, pad.width, pad.height,
               stride.width, stride.height, dilation.width, dilation.height);*/
-       CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % blobs[0].size[1] == 0,
+       int inpGroupCn = blobs.empty() ? inputs[1].size[1] : blobs[0].size[1];
+       CV_Assert_N(inputs.size() >= (size_t)1, inputs[0].size[1] % inpGroupCn == 0,
                    outputs.size() == 1, inputs[0].data != outputs[0].data);
-       int ngroups = inputs[0].size[1]/blobs[0].size[1];
+       int ngroups = inputs[0].size[1] / inpGroupCn;
        CV_Assert(outputs[0].size[1] % ngroups == 0);
-       int outCn = blobs[0].size[0];
        reluslope.clear();
        if( activ )
@@ -1810,11 +1876,11 @@ public:
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
-       CV_Assert(inputs.size() == outputs.size());
+       CV_Assert(inputs.size() == outputs.size() || inputs.size() == outputs.size() + blobs.size());
        int64 flops = 0;
        int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies<size_t>());
-       for (int i = 0; i < inputs.size(); i++)
+       for (int i = 0; i < outputs.size(); i++)
        {
            flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1);
        }

modules/dnn/src/layers/fully_connected_layer.cpp
@@ -587,7 +587,7 @@ public:
        }
        else
        {
-           std::vector<size_t> data = {(size_t)ieInpNode->get_shape()[0], (size_t)blobs[0].size[1]};
+           std::vector<int64_t> data = {(int64_t)ieInpNode->get_shape()[0], (int64_t)blobs[0].size[1]};
            auto new_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, data.data());
            auto inp = std::make_shared<ngraph::op::v1::Reshape>(ieInpNode, new_shape, true);

modules/dnn/src/layers/permute_layer.cpp
@@ -397,8 +397,9 @@ public:
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+       std::vector<int64_t> order(_order.begin(), _order.end());
        auto tr_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
-                                                             ngraph::Shape({_order.size()}), _order.data());
+                                                             ngraph::Shape({order.size()}), order.data());
        auto transpose = std::make_shared<ngraph::op::Transpose>(ieInpNode, tr_axes);
        return Ptr<BackendNode>(new InfEngineNgraphNode(transpose));
    }

modules/dnn/src/layers/slice_layer.cpp
@@ -167,6 +167,10 @@ public:
    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
+#ifdef HAVE_OPENCL
+       ocl_exec_cache.clear();
+#endif
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
@@ -221,26 +225,33 @@ public:
    }
#ifdef HAVE_OPENCL
-   bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
-   {
-       std::vector<UMat> inputs;
-       std::vector<UMat> outputs;
-       inputs_.getUMatVector(inputs);
-       outputs_.getUMatVector(outputs);
+   struct OpenCLExecInfo
+   {
+       std::string kernel_name;
+       std::string build_opts;
+       size_t local_size[2];
+       size_t global_size[2];
+       OpenCLExecInfo()
+       {
+           local_size[0] = local_size[1] = 0;
+           global_size[0] = global_size[1] = 0;
+       }
+   };
+   std::vector<OpenCLExecInfo> ocl_exec_cache;
+   void ocl_prepare(const std::vector<UMat>& inputs, const std::vector<UMat>& outputs)
+   {
+       CV_TRACE_FUNCTION();
        CV_Assert(outputs.size() == finalSliceRanges.size());
+       ocl_exec_cache.resize(outputs.size());
        const UMat& input = inputs[0];
-       if (input.dims > 5)
-       {
-           CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
-           return false;
-       }
+       const int dims = input.dims;
        size_t WSZ = 128;
-       const int dims = input.dims;
        const int elemSize = (int)input.elemSize();
        String opts0 = cv::format(
                "-DDIMS=%d -DELEMSIZE=%d",
@@ -250,10 +261,11 @@ public:
        {
            opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
        }
-       String kname = cv::format("slice_%d", dims);
        for (size_t i = 0; i < outputs.size(); i++)
        {
-           UMat& output = outputs[i];
+           OpenCLExecInfo& ocl = ocl_exec_cache[i];
+           const UMat& output = outputs[i];
            const std::vector<Range>& range = finalSliceRanges[i];
            String opts = opts0;
@@ -269,6 +281,8 @@ public:
                CV_CheckEQ(range[d].size(), (int)output.size[d], "");
            }
+           const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64;
            int block_dims = 0;
            size_t block_size = elemSize;
            for (int i = dims - 1; i >= 0; --i)
@@ -277,12 +291,14 @@ public:
                    break;
                block_size *= output.size[i];
                block_dims++;
+               if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)
+                   break;
            }
            const size_t total = output.total() * elemSize;
            size_t num_blocks = total / block_size;
-           if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
+           if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG))
            {
                // use 1D copy mode
                opts += cv::format(" -DUSE_COPY_1D=1");
@@ -352,23 +368,98 @@ public:
            opts += cv::format(" -DWSZ=%d", (int)WSZ);
-           size_t local[] = { WSZ, 1 };
-           size_t global[] = { WSZ, num_blocks };
-           ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
+           std::ostringstream kernel_suffix;
+           kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size;
+           kernel_suffix << "__src_";
+           for (int d = 0; d < dims; d++)
+           {
+               kernel_suffix << input.size[dims - 1 - d] << '_';
+           }
+           kernel_suffix << '_';
+           /*for (int d = 0; d < dims; d++)
+           {
+               kernel_suffix << input.step[dims - 1 - d] << '_';
+           }
+           kernel_suffix << '_';*/
+           kernel_suffix << "dst_";
+           for (int d = 0; d < dims; d++)
+           {
+               kernel_suffix << output.size[dims - 1 - d] << '_';
+           }
+           /*kernel_suffix << '_';
+           for (int d = 0; d < dims; d++)
+           {
+               kernel_suffix << output.step[dims - 1 - d] << '_';
+           }*/
+           kernel_suffix << "_slice_";
+           for (int d = 0; d < dims; d++)
+           {
+               kernel_suffix << range[dims - 1 - d].start << '_';
+           }
+           for (int d = 0; d < dims; d++)
+           {
+               kernel_suffix << '_' << range[dims - 1 - d].end;
+           }
+           std::string kernel_suffix_str = kernel_suffix.str();
+           opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str());
+           ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str());
+           ocl.build_opts = opts;
+           ocl.local_size[0] = WSZ;
+           ocl.local_size[1] = 1;
+           ocl.global_size[0] = WSZ;
+           ocl.global_size[1] = num_blocks;
+       }  // for outputs.size()
+   }  // ocl_prepare
+   bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+   {
+       CV_TRACE_FUNCTION();
+       std::vector<UMat> inputs;
+       std::vector<UMat> outputs;
+       inputs_.getUMatVector(inputs);
+       outputs_.getUMatVector(outputs);
+       CV_Assert(outputs.size() == finalSliceRanges.size());
+       const UMat& input = inputs[0];
+       const int dims = input.dims;
+       if (dims > 5)
+       {
+           CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". Fallback to CPU");
+           return false;
+       }
+       if (ocl_exec_cache.empty())
+       {
+           ocl_prepare(inputs, outputs);
+       }
+       CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), "");
+       for (size_t i = 0; i < outputs.size(); i++)
+       {
+           const OpenCLExecInfo& ocl = ocl_exec_cache[i];
+           UMat& output = outputs[i];
+           ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts);
            if (kernel.empty())
                return false;
            bool ret = kernel.args(
                    ocl::KernelArg::PtrReadOnly(input),
                    ocl::KernelArg::PtrWriteOnly(output)
                )
-               .run(2, global, local, false);
+               .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
            if (!ret)
                return false;
        }  // for outputs.size()
        return true;
-   }
+   }  // forward_ocl
#endif
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE

modules/dnn/src/onnx/onnx_importer.cpp
@@ -1003,10 +1003,13 @@ void ONNXImporter::populateNet(Net dstNet)
            CV_Assert(node_proto.input_size() >= 2);
            layerParams.type = "Convolution";
            for (int j = 1; j < node_proto.input_size(); j++) {
-               layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
+               if (constBlobs.find(node_proto.input(j)) != constBlobs.end())
+               {
+                   layerParams.blobs.push_back(getBlob(node_proto, constBlobs, j));
+               }
            }
-           layerParams.set("num_output", layerParams.blobs[0].size[0]);
-           layerParams.set("bias_term", node_proto.input_size() == 3);
+           int outCn = layerParams.blobs.empty() ? outShapes[node_proto.input(1)][0] : layerParams.blobs[0].size[0];
+           layerParams.set("num_output", outCn);
        }
        else if (layer_type == "ConvTranspose")
        {

modules/dnn/src/opencl/slice.cl
@@ -48,19 +48,85 @@ global: <WSZ, number_of_copy_blocks, 1>
#define BLOCK_COLS_X4 (BLOCK_COLS / 4)
#define BLOCK_COLS_X16 (BLOCK_COLS / 16)
-#ifdef USE_COPY_1D
-static inline
-__attribute__((always_inline))
-void copy_block_1d(
+__attribute__((reqd_work_group_size(WSZ, 1, 1)))
+__kernel void
+CONCAT(slice_, SLICE_KERNEL_SUFFIX)(
    __global const uchar* src0,
-   const uint src_offset,
-   __global uchar* dst0,
-   const uint dst_offset
+   __global uchar* dst0
)
{
-   __global const uchar* src = src0 + src_offset;
-   __global uchar* dst = dst0 + dst_offset;
+   uint block_id = get_global_id(1);
+   uint dst_offset0 = block_id * BLOCK_SIZE;
+   uint src_offset0 = 0;
+   { // calculate src_offset0
+#define CALC_SRC_INDEX(dim) \
+   { \
+       uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
+       CONCAT(idx_, dim) = block_id / plane_sz; \
+       block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
+   }
+#define UPDATE_SRC_OFFSET(dim) \
+   src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0);
+/*
+       if (get_global_id(0) == 0 && get_global_id(1) == 0) \
+           printf("(%d, %d): @%d src_offset0=%d idx_dim=%d block_id=%d\n", \
+               get_global_id(0), get_global_id(1), \
+               dim, src_offset0, CONCAT(idx_, dim), block_id \
+           );
+*/
+#if DIMS > 5
+#error "invalid configuration"
+#endif
+#if DIMS > 4
+   uint idx_4 = 0;
+#if BLOCK_DIMS <= 4
+   CALC_SRC_INDEX(4)
+#endif
+   UPDATE_SRC_OFFSET(4)
+#endif
+#if DIMS > 3
+   uint idx_3 = 0;
+#if BLOCK_DIMS <= 3
+   CALC_SRC_INDEX(3)
+#endif
+   UPDATE_SRC_OFFSET(3)
+#endif
+#if DIMS > 2
+   uint idx_2 = 0;
+#if BLOCK_DIMS <= 2
+   CALC_SRC_INDEX(2)
+#endif
+   UPDATE_SRC_OFFSET(2)
+#endif
+#if DIMS > 1
+   uint idx_1 = 0;
+#if BLOCK_DIMS <= 1
+   CALC_SRC_INDEX(1)
+#endif
+   UPDATE_SRC_OFFSET(1)
+#endif
+#if DIMS > 0
+   uint idx_0 = 0;
+   UPDATE_SRC_OFFSET(0)
+#endif
+/*
+       if (get_global_id(0) == 0)
+           printf("(%d, %d): src_offset0=%d dst_offset0=%d\n",
+               get_global_id(0), get_global_id(1),
+               src_offset0, dst_offset0
+           );
+*/
+   } // calculate src_offset0
+#ifdef USE_COPY_1D
+   { // copy_block_1d
+   __global const uchar* src = src0 + src_offset0;
+   __global uchar* dst = dst0 + dst_offset0;
    uint processed = 0;
@@ -70,8 +136,9 @@ void copy_block_1d(
        uint i = get_local_id(0) * 16; // uchar16
        while (i < BLOCK_COLS_X16 * 16)
        {
-           uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ);
-           idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16));
+           uint4 idx0 = (uint4)i;
+           uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ);
+           idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16));
            uchar16 a0 = vload16(0, src + idx.s0);
            uchar16 a1 = vload16(0, src + idx.s1);
@@ -97,8 +164,9 @@ void copy_block_1d(
        uint i = get_local_id(0) * 4 + processed; // uchar4
        while (i < BLOCK_COLS_X4 * 4)
        {
-           uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ);
-           idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4));
+           uint4 idx0 = (uint4)i;
+           uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ);
+           idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4));
            uchar4 a0 = vload4(0, src + idx.s0);
            uchar4 a1 = vload4(0, src + idx.s1);
@@ -130,19 +198,11 @@ void copy_block_1d(
        }
    }
#endif
-}
-#else // USE_COPY_1D
-static inline
-__attribute__((always_inline))
-void copy_block_2d(
-   __global const uchar* src0,
-   const uint src_offset0,
-   __global uchar* dst0,
-   const uint dst_offset0
-)
-{
+   } // copy_block_1d
+#else
+   { // copy_block_2d
    __global const uchar* src = src0 + src_offset0;
    __global uchar* dst = dst0 + dst_offset0;
@@ -199,85 +259,6 @@ void copy_block_2d(
#endif // BLOCK_COLS_FILL_X4 != BLOCK_COLS
        i += WSZ * 4;
    }
-}
-#endif // USE_COPY_1D
-__kernel void
-CONCAT(slice_, DIMS)(
-   __global const uchar* src,
-   __global uchar* dst
-)
-{
-   uint block_id = get_global_id(1);
-   uint dst_offset = block_id * BLOCK_SIZE;
-   uint src_offset = 0;
-#define CALC_SRC_INDEX(dim) \
-   { \
-       uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
-       CONCAT(idx_, dim) = block_id / plane_sz; \
-       block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
-   }
-#define UPDATE_SRC_OFFSET(dim) \
-   src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset);
-/*
-       if (get_global_id(0) == 0 && get_global_id(1) == 0) \
-           printf("(%d, %d): @%d src_offset=%d idx_dim=%d block_id=%d\n", \
-               get_global_id(0), get_global_id(1), \
-               dim, src_offset, CONCAT(idx_, dim), block_id \
-           );
-*/
-#if DIMS > 5
-#error "invalid configuration"
-#endif
-#if DIMS > 4
-   uint idx_4 = 0;
-#if BLOCK_DIMS <= 4
-   CALC_SRC_INDEX(4)
-#endif
-   UPDATE_SRC_OFFSET(4)
-#endif
-#if DIMS > 3
-   uint idx_3 = 0;
-#if BLOCK_DIMS <= 3
-   CALC_SRC_INDEX(3)
-#endif
-   UPDATE_SRC_OFFSET(3)
-#endif
-#if DIMS > 2
-   uint idx_2 = 0;
-#if BLOCK_DIMS <= 2
-   CALC_SRC_INDEX(2)
-#endif
-   UPDATE_SRC_OFFSET(2)
-#endif
-#if DIMS > 1
-   uint idx_1 = 0;
-#if BLOCK_DIMS <= 1
-   CALC_SRC_INDEX(1)
-#endif
-   UPDATE_SRC_OFFSET(1)
-#endif
-#if DIMS > 0
-   uint idx_0 = 0;
-   UPDATE_SRC_OFFSET(0)
-#endif
-/*
-       if (get_global_id(0) == 0)
-           printf("(%d, %d): src_offset=%d dst_offset=%d\n",
-               get_global_id(0), get_global_id(1),
-               src_offset, dst_offset
-           );
-*/
-#ifdef USE_COPY_1D
-   copy_block_1d(src, src_offset, dst, dst_offset);
-#else
-   copy_block_2d(src, src_offset, dst, dst_offset);
+   } // copy_block_2d
#endif
}

modules/dnn/test/test_darknet_importer.cpp
@@ -784,6 +784,11 @@ TEST_P(Test_Darknet_layers, connected)
    testDarknetLayer("connected", true);
}
+TEST_P(Test_Darknet_layers, relu)
+{
+    testDarknetLayer("relu");
+}
INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_layers, dnnBackendsAndTargets());
}} // namespace

modules/dnn/test/test_layers.cpp
@@ -1133,6 +1133,9 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");
@@ -1143,9 +1146,8 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
    else
        FAIL() << "Unknown backendId";
-   std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
    Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt"));
-   Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));
+   Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
    Mat inp = blobFromNPY(_tf("blob.npy"));
@@ -1165,7 +1167,10 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
    std::vector<int> outLayers = net.getUnconnectedOutLayers();
    ASSERT_EQ(net.getLayer(outLayers[0])->name, "output");
-   ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution");
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+       ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution");
+   else
+       ASSERT_EQ(net.getLayer(outLayers[0])->type, "Add");
}
TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
@@ -1173,6 +1178,9 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");
@@ -1189,12 +1197,10 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
    randu(inputs[0], 0, 255);
    inputs[0].convertTo(inputs[1], CV_32F);
-   std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
    Mat outs[2];
    for (int i = 0; i < 2; ++i)
    {
-       Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));
+       Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
        net.setPreferableBackend(backendId);
        net.setPreferableTarget(targetId);
        net.setInput(inputs[i]);
@@ -1210,6 +1216,9 @@ TEST_P(Layer_Test_Convolution_DLDT, multithreading)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");
@@ -1220,9 +1229,8 @@ TEST_P(Layer_Test_Convolution_DLDT, multithreading)
    else
        FAIL() << "Unknown backendId";
-   std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-   std::string xmlPath = _tf("layer_convolution" + suffix + ".xml");
-   std::string binPath = _tf("layer_convolution" + suffix + ".bin");
+   std::string xmlPath = _tf("layer_convolution.xml");
+   std::string binPath = _tf("layer_convolution.bin");
    Net firstNet = readNet(xmlPath, binPath);
    Net secondNet = readNet(xmlPath, binPath);
    Mat inp = blobFromNPY(_tf("blob.npy"));
@@ -1281,8 +1289,7 @@ TEST_P(Test_DLDT_two_inputs_3dim, as_IR)
    int secondInpType = get<1>(GetParam());
    Target targetId = get<2>(GetParam());
-   std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-   Net net = readNet(_tf("net_two_inputs" + suffix + ".xml"), _tf("net_two_inputs.bin"));
+   Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin"));
    std::vector<int> inpSize = get<3>(GetParam());
    Mat firstInp(3, inpSize.data(), firstInpType);
    Mat secondInp(3, inpSize.data(), secondInpType);

modules/dnn/test/test_misc.cpp
@@ -444,12 +444,14 @@ TEST_P(Async, model_optimizer_pipeline_set_and_forward_single)
    const Backend backendId = get<0>(get<1>(GetParam()));
    const Target targetId = get<1>(get<1>(GetParam()));
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");
-   const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-   const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-   const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+   const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+   const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -503,12 +505,14 @@ TEST_P(Async, model_optimizer_pipeline_set_and_forward_all)
    const Backend backendId = get<0>(get<1>(GetParam()));
    const Target targetId = get<1>(get<1>(GetParam()));
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");
-   const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-   const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-   const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+   const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+   const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -677,9 +681,11 @@ TEST_P(Test_Model_Optimizer, forward_two_nets)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());
-   const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-   const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-   const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+   const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+   const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -716,12 +722,14 @@ TEST_P(Test_Model_Optimizer, readFromBuffer)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
    if (backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
        throw SkipTestException("No support for async forward");
-   const std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
-   const std::string& weightsFile = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
-   const std::string& modelFile = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+   const std::string& weightsFile = findDataFile("dnn/layers/layer_convolution.bin");
+   const std::string& modelFile = findDataFile("dnn/layers/layer_convolution.xml");
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);
@@ -769,8 +777,11 @@ TEST_P(Test_Model_Optimizer, flexible_inputs)
    const Backend backendId = get<0>(GetParam());
    const Target targetId = get<1>(GetParam());
-   const std::string& model = findDataFile("dnn/layers/layer_convolution_fp16.bin");
-   const std::string& proto = findDataFile("dnn/layers/layer_convolution_fp16.xml");
+   if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && targetId == DNN_TARGET_MYRIAD)
+       applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+   const std::string& model = findDataFile("dnn/layers/layer_convolution.bin");
+   const std::string& proto = findDataFile("dnn/layers/layer_convolution.xml");
    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
        setInferenceEngineBackendType(CV_DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_API);

@ -114,6 +114,62 @@ TEST_P(Test_ONNX_layers, Convolution)
testONNXModels("convolution"); testONNXModels("convolution");
} }
TEST_P(Test_ONNX_layers, Convolution_variable_weight)
{
if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
String basename = "conv_variable_w";
Net net = readNetFromONNX(_tf("models/" + basename + ".onnx"));
ASSERT_FALSE(net.empty());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
for (int i = 0; i < 2; i++)
{
Mat input = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_0.npy"));
Mat weights = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_1.npy"));
Mat ref = blobFromNPY(_tf("data/output_" + basename + format("_%d", i) + ".npy"));
net.setInput(input, "0");
net.setInput(weights, "1");
Mat out = net.forward();
normAssert(ref, out, "", default_l1, default_lInf);
}
}
TEST_P(Test_ONNX_layers, Convolution_variable_weight_bias)
{
if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH ||
backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) && target == DNN_TARGET_MYRIAD)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
String basename = "conv_variable_wb";
Net net = readNetFromONNX(_tf("models/" + basename + ".onnx"));
ASSERT_FALSE(net.empty());
net.setPreferableBackend(backend);
net.setPreferableTarget(target);
for (int i = 0; i < 2; i++)
{
Mat input = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_0.npy"));
Mat weights = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_1.npy"));
Mat bias = blobFromNPY(_tf("data/input_" + basename + format("_%d", i) + "_2.npy"));
Mat ref = blobFromNPY(_tf("data/output_" + basename + format("_%d", i) + ".npy"));
net.setInput(input, "0");
net.setInput(weights, "1");
net.setInput(bias, "bias");
Mat out = net.forward();
normAssert(ref, out, "", default_l1, default_lInf);
}
}
TEST_P(Test_ONNX_layers, Gather)
{
    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)

@@ -245,6 +245,31 @@ typedef Feature2D DescriptorExtractor;
//! @{
/** @brief Class for implementing the wrapper which makes detectors and extractors to be affine invariant,
described as ASIFT in @cite YM11 .
*/
class CV_EXPORTS_W AffineFeature : public Feature2D
{
public:
/**
@param backend The detector/extractor you want to use as backend.
@param maxTilt The highest power index of tilt factor. 5 is used in the paper as tilt sampling range n.
@param minTilt The lowest power index of tilt factor. 0 is used in the paper.
@param tiltStep Tilt sampling step \f$\delta_t\f$ in Algorithm 1 in the paper.
@param rotateStepBase Rotation sampling step factor b in Algorithm 1 in the paper.
*/
CV_WRAP static Ptr<AffineFeature> create(const Ptr<Feature2D>& backend,
int maxTilt = 5, int minTilt = 0, float tiltStep = 1.4142135623730951f, float rotateStepBase = 72);
CV_WRAP virtual void setViewParams(const std::vector<float>& tilts, const std::vector<float>& rolls) = 0;
CV_WRAP virtual void getViewParams(std::vector<float>& tilts, std::vector<float>& rolls) const = 0;
CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
};
typedef AffineFeature AffineFeatureDetector;
typedef AffineFeature AffineDescriptorExtractor;
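For quick orientation, a minimal usage sketch of the new wrapper, in the spirit of the samples/cpp/asift.cpp sample added later in this PR; the image path is a placeholder and any Feature2D backend could be substituted for SIFT.

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/features2d.hpp>
#include <vector>

int main()
{
    // Placeholder input; in the real sample the images come from the command line.
    cv::Mat img = cv::imread("some_image.png", cv::IMREAD_GRAYSCALE);

    // Wrap a SIFT backend; tilt/roll sampling uses the defaults documented above.
    cv::Ptr<cv::AffineFeature> asift = cv::AffineFeature::create(cv::SIFT::create());

    std::vector<cv::KeyPoint> keypoints;
    cv::Mat descriptors;
    // Keypoints are gathered over the simulated affine views; each keypoint's
    // class_id records the index of the (tilt, roll) view it was detected in.
    asift->detectAndCompute(img, cv::noArray(), keypoints, descriptors);
    return 0;
}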
/** @brief Class for extracting keypoints and computing descriptors using the Scale Invariant Feature Transform
(SIFT) algorithm by D. Lowe @cite Lowe04 .
*/

@ -0,0 +1,358 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// This file is based on code issued with the following license.
/*********************************************************************
* Software License Agreement (BSD License)
*
* Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
* Copyright (C) 2008-2013, Willow Garage Inc., all rights reserved.
* Copyright (C) 2013, Evgeny Toropov, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * The name of the copyright holders may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*********************************************************************/
/*
Guoshen Yu, Jean-Michel Morel, ASIFT: An Algorithm for Fully Affine
Invariant Comparison, Image Processing On Line, 1 (2011), pp. 11–38.
https://doi.org/10.5201/ipol.2011.my-asift
*/
#include "precomp.hpp"
#include <iostream>
namespace cv {
class AffineFeature_Impl CV_FINAL : public AffineFeature
{
public:
explicit AffineFeature_Impl(const Ptr<Feature2D>& backend,
int maxTilt, int minTilt, float tiltStep, float rotateStepBase);
int descriptorSize() const CV_OVERRIDE
{
return backend_->descriptorSize();
}
int descriptorType() const CV_OVERRIDE
{
return backend_->descriptorType();
}
int defaultNorm() const CV_OVERRIDE
{
return backend_->defaultNorm();
}
void detectAndCompute(InputArray image, InputArray mask, std::vector<KeyPoint>& keypoints,
OutputArray descriptors, bool useProvidedKeypoints=false) CV_OVERRIDE;
void setViewParams(const std::vector<float>& tilts, const std::vector<float>& rolls) CV_OVERRIDE;
void getViewParams(std::vector<float>& tilts, std::vector<float>& rolls) const CV_OVERRIDE;
protected:
void splitKeypointsByView(const std::vector<KeyPoint>& keypoints_,
std::vector< std::vector<KeyPoint> >& keypointsByView) const;
const Ptr<Feature2D> backend_;
int maxTilt_;
int minTilt_;
float tiltStep_;
float rotateStepBase_;
// Tilt factors.
std::vector<float> tilts_;
// Roll factors.
std::vector<float> rolls_;
private:
AffineFeature_Impl(const AffineFeature_Impl &); // copy disabled
AffineFeature_Impl& operator=(const AffineFeature_Impl &); // assign disabled
};
AffineFeature_Impl::AffineFeature_Impl(const Ptr<FeatureDetector>& backend,
int maxTilt, int minTilt, float tiltStep, float rotateStepBase)
: backend_(backend), maxTilt_(maxTilt), minTilt_(minTilt), tiltStep_(tiltStep), rotateStepBase_(rotateStepBase)
{
int i = minTilt_;
if( i == 0 )
{
tilts_.push_back(1);
rolls_.push_back(0);
i++;
}
float tilt = 1;
for( ; i <= maxTilt_; i++ )
{
tilt *= tiltStep_;
float rotateStep = rotateStepBase_ / tilt;
int rollN = cvFloor(180.0f / rotateStep);
if( rollN * rotateStep == 180.0f )
rollN--;
for( int j = 0; j <= rollN; j++ )
{
tilts_.push_back(tilt);
rolls_.push_back(rotateStep * j);
}
}
}
void AffineFeature_Impl::setViewParams(const std::vector<float>& tilts,
const std::vector<float>& rolls)
{
CV_Assert(tilts.size() == rolls.size());
tilts_ = tilts;
rolls_ = rolls;
}
void AffineFeature_Impl::getViewParams(std::vector<float>& tilts,
std::vector<float>& rolls) const
{
tilts = tilts_;
rolls = rolls_;
}
void AffineFeature_Impl::splitKeypointsByView(const std::vector<KeyPoint>& keypoints_,
std::vector< std::vector<KeyPoint> >& keypointsByView) const
{
for( size_t i = 0; i < keypoints_.size(); i++ )
{
const KeyPoint& kp = keypoints_[i];
CV_Assert( kp.class_id >= 0 && kp.class_id < (int)tilts_.size() );
keypointsByView[kp.class_id].push_back(kp);
}
}
class skewedDetectAndCompute : public ParallelLoopBody
{
public:
skewedDetectAndCompute(
const std::vector<float>& _tilts,
const std::vector<float>& _rolls,
std::vector< std::vector<KeyPoint> >& _keypointsCollection,
std::vector<Mat>& _descriptorCollection,
const Mat& _image,
const Mat& _mask,
const bool _do_keypoints,
const bool _do_descriptors,
const Ptr<Feature2D>& _backend)
: tilts(_tilts),
rolls(_rolls),
keypointsCollection(_keypointsCollection),
descriptorCollection(_descriptorCollection),
image(_image),
mask(_mask),
do_keypoints(_do_keypoints),
do_descriptors(_do_descriptors),
backend(_backend) {}
void operator()( const cv::Range& range ) const CV_OVERRIDE
{
CV_TRACE_FUNCTION();
const int begin = range.start;
const int end = range.end;
for( int a = begin; a < end; a++ )
{
Mat warpedImage, warpedMask;
Matx23f pose, invPose;
affineSkew(tilts[a], rolls[a], warpedImage, warpedMask, pose);
invertAffineTransform(pose, invPose);
std::vector<KeyPoint> wKeypoints;
Mat wDescriptors;
if( !do_keypoints )
{
const std::vector<KeyPoint>& keypointsInView = keypointsCollection[a];
if( keypointsInView.size() == 0 ) // when there are no keypoints in this affine view
continue;
std::vector<Point2f> pts_, pts;
KeyPoint::convert(keypointsInView, pts_);
transform(pts_, pts, pose);
wKeypoints.resize(keypointsInView.size());
for( size_t wi = 0; wi < wKeypoints.size(); wi++ )
{
wKeypoints[wi] = keypointsInView[wi];
wKeypoints[wi].pt = pts[wi];
}
}
backend->detectAndCompute(warpedImage, warpedMask, wKeypoints, wDescriptors, !do_keypoints);
if( do_keypoints )
{
// KeyPointsFilter::runByPixelsMask( wKeypoints, warpedMask );
if( wKeypoints.size() == 0 )
{
keypointsCollection[a].clear();
continue;
}
std::vector<Point2f> pts_, pts;
KeyPoint::convert(wKeypoints, pts_);
transform(pts_, pts, invPose);
keypointsCollection[a].resize(wKeypoints.size());
for( size_t wi = 0; wi < wKeypoints.size(); wi++ )
{
keypointsCollection[a][wi] = wKeypoints[wi];
keypointsCollection[a][wi].pt = pts[wi];
keypointsCollection[a][wi].class_id = a;
}
}
if( do_descriptors )
wDescriptors.copyTo(descriptorCollection[a]);
}
}
private:
void affineSkew(float tilt, float phi,
Mat& warpedImage, Mat& warpedMask, Matx23f& pose) const
{
int h = image.size().height;
int w = image.size().width;
Mat rotImage;
Mat mask0;
if( mask.empty() )
mask0 = Mat(h, w, CV_8UC1, 255);
else
mask0 = mask;
pose = Matx23f(1,0,0,
0,1,0);
if( phi == 0 )
image.copyTo(rotImage);
else
{
phi = phi * (float)CV_PI / 180;
float s = std::sin(phi);
float c = std::cos(phi);
Matx22f A(c, -s, s, c);
Matx<float, 4, 2> corners(0, 0, (float)w, 0, (float)w,(float)h, 0, (float)h);
Mat tf(corners * A.t());
Mat tcorners;
tf.convertTo(tcorners, CV_32S);
Rect rect = boundingRect(tcorners);
h = rect.height; w = rect.width;
pose = Matx23f(c, -s, -(float)rect.x,
s, c, -(float)rect.y);
warpAffine(image, rotImage, pose, Size(w, h), INTER_LINEAR, BORDER_REPLICATE);
}
if( tilt == 1 )
warpedImage = rotImage;
else
{
float s = 0.8f * sqrt(tilt * tilt - 1);
GaussianBlur(rotImage, rotImage, Size(0, 0), s, 0.01);
resize(rotImage, warpedImage, Size(0, 0), 1.0/tilt, 1.0, INTER_NEAREST);
pose(0, 0) /= tilt;
pose(0, 1) /= tilt;
pose(0, 2) /= tilt;
}
if( phi != 0 || tilt != 1 )
warpAffine(mask0, warpedMask, pose, warpedImage.size(), INTER_NEAREST);
}
const std::vector<float>& tilts;
const std::vector<float>& rolls;
std::vector< std::vector<KeyPoint> >& keypointsCollection;
std::vector<Mat>& descriptorCollection;
const Mat& image;
const Mat& mask;
const bool do_keypoints;
const bool do_descriptors;
const Ptr<Feature2D>& backend;
};
void AffineFeature_Impl::detectAndCompute(InputArray _image, InputArray _mask,
std::vector<KeyPoint>& keypoints,
OutputArray _descriptors,
bool useProvidedKeypoints)
{
CV_TRACE_FUNCTION();
bool do_keypoints = !useProvidedKeypoints;
bool do_descriptors = _descriptors.needed();
Mat image = _image.getMat(), mask = _mask.getMat();
Mat descriptors;
if( (!do_keypoints && !do_descriptors) || _image.empty() )
return;
std::vector< std::vector<KeyPoint> > keypointsCollection(tilts_.size());
std::vector< Mat > descriptorCollection(tilts_.size());
if( do_keypoints )
keypoints.clear();
else
splitKeypointsByView(keypoints, keypointsCollection);
parallel_for_(Range(0, (int)tilts_.size()), skewedDetectAndCompute(tilts_, rolls_, keypointsCollection, descriptorCollection,
image, mask, do_keypoints, do_descriptors, backend_));
if( do_keypoints )
for( size_t i = 0; i < keypointsCollection.size(); i++ )
{
const std::vector<KeyPoint>& keys = keypointsCollection[i];
keypoints.insert(keypoints.end(), keys.begin(), keys.end());
}
if( do_descriptors )
{
_descriptors.create((int)keypoints.size(), backend_->descriptorSize(), backend_->descriptorType());
descriptors = _descriptors.getMat();
int iter = 0;
for( size_t i = 0; i < descriptorCollection.size(); i++ )
{
const Mat& descs = descriptorCollection[i];
if( descs.empty() )
continue;
Mat roi(descriptors, Rect(0, iter, descriptors.cols, descs.rows));
descs.copyTo(roi);
iter += descs.rows;
}
}
}
Ptr<AffineFeature> AffineFeature::create(const Ptr<Feature2D>& backend,
int maxTilt, int minTilt, float tiltStep, float rotateStepBase)
{
CV_Assert(minTilt < maxTilt);
CV_Assert(tiltStep > 0);
CV_Assert(rotateStepBase > 0);
return makePtr<AffineFeature_Impl>(backend, maxTilt, minTilt, tiltStep, rotateStepBase);
}
String AffineFeature::getDefaultName() const
{
return (Feature2D::getDefaultName() + ".AffineFeature");
}
} // namespace

@ -0,0 +1,185 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "test_precomp.hpp"
// #define GENERATE_DATA // generate data in debug mode
namespace opencv_test { namespace {
#ifndef GENERATE_DATA
static bool isSimilarKeypoints( const KeyPoint& p1, const KeyPoint& p2 )
{
const float maxPtDif = 1.f;
const float maxSizeDif = 1.f;
const float maxAngleDif = 2.f;
const float maxResponseDif = 0.1f;
float dist = (float)cv::norm( p1.pt - p2.pt );
return (dist < maxPtDif &&
fabs(p1.size - p2.size) < maxSizeDif &&
abs(p1.angle - p2.angle) < maxAngleDif &&
abs(p1.response - p2.response) < maxResponseDif &&
(p1.octave & 0xffff) == (p2.octave & 0xffff) // do not care about sublayers and class_id
);
}
#endif
TEST(Features2d_AFFINE_FEATURE, regression)
{
Mat image = imread(cvtest::findDataFile("features2d/tsukuba.png"));
string xml = cvtest::TS::ptr()->get_data_path() + "asift/regression_cpp.xml.gz";
ASSERT_FALSE(image.empty());
Mat gray;
cvtColor(image, gray, COLOR_BGR2GRAY);
// Default ASIFT generates too large descriptors. This test uses small maxTilt to suppress the size of testdata.
Ptr<AffineFeature> ext = AffineFeature::create(SIFT::create(), 2, 0, 1.4142135623730951f, 144.0f);
Mat mpt, msize, mangle, mresponse, moctave, mclass_id;
#ifdef GENERATE_DATA
// calculate
vector<KeyPoint> calcKeypoints;
Mat calcDescriptors;
ext->detectAndCompute(gray, Mat(), calcKeypoints, calcDescriptors, false);
// create keypoints XML
FileStorage fs(xml, FileStorage::WRITE);
ASSERT_TRUE(fs.isOpened()) << xml;
std::cout << "Creating keypoints XML..." << std::endl;
mpt = Mat(calcKeypoints.size(), 2, CV_32F);
msize = Mat(calcKeypoints.size(), 1, CV_32F);
mangle = Mat(calcKeypoints.size(), 1, CV_32F);
mresponse = Mat(calcKeypoints.size(), 1, CV_32F);
moctave = Mat(calcKeypoints.size(), 1, CV_32S);
mclass_id = Mat(calcKeypoints.size(), 1, CV_32S);
for( size_t i = 0; i < calcKeypoints.size(); i++ )
{
const KeyPoint& key = calcKeypoints[i];
mpt.at<float>(i, 0) = key.pt.x;
mpt.at<float>(i, 1) = key.pt.y;
msize.at<float>(i, 0) = key.size;
mangle.at<float>(i, 0) = key.angle;
mresponse.at<float>(i, 0) = key.response;
moctave.at<int>(i, 0) = key.octave;
mclass_id.at<int>(i, 0) = key.class_id;
}
fs << "keypoints_pt" << mpt;
fs << "keypoints_size" << msize;
fs << "keypoints_angle" << mangle;
fs << "keypoints_response" << mresponse;
fs << "keypoints_octave" << moctave;
fs << "keypoints_class_id" << mclass_id;
// create descriptor XML
fs << "descriptors" << calcDescriptors;
fs.release();
#else
const float badCountsRatio = 0.01f;
const float badDescriptorDist = 1.0f;
const float maxBadKeypointsRatio = 0.15f;
const float maxBadDescriptorRatio = 0.15f;
// read keypoints
vector<KeyPoint> validKeypoints;
Mat validDescriptors;
FileStorage fs(xml, FileStorage::READ);
ASSERT_TRUE(fs.isOpened()) << xml;
fs["keypoints_pt"] >> mpt;
ASSERT_EQ(mpt.type(), CV_32F);
fs["keypoints_size"] >> msize;
ASSERT_EQ(msize.type(), CV_32F);
fs["keypoints_angle"] >> mangle;
ASSERT_EQ(mangle.type(), CV_32F);
fs["keypoints_response"] >> mresponse;
ASSERT_EQ(mresponse.type(), CV_32F);
fs["keypoints_octave"] >> moctave;
ASSERT_EQ(moctave.type(), CV_32S);
fs["keypoints_class_id"] >> mclass_id;
ASSERT_EQ(mclass_id.type(), CV_32S);
validKeypoints.resize(mpt.rows);
for( int i = 0; i < (int)validKeypoints.size(); i++ )
{
validKeypoints[i].pt.x = mpt.at<float>(i, 0);
validKeypoints[i].pt.y = mpt.at<float>(i, 1);
validKeypoints[i].size = msize.at<float>(i, 0);
validKeypoints[i].angle = mangle.at<float>(i, 0);
validKeypoints[i].response = mresponse.at<float>(i, 0);
validKeypoints[i].octave = moctave.at<int>(i, 0);
validKeypoints[i].class_id = mclass_id.at<int>(i, 0);
}
// read descriptors
fs["descriptors"] >> validDescriptors;
fs.release();
// calc and compare keypoints
vector<KeyPoint> calcKeypoints;
ext->detectAndCompute(gray, Mat(), calcKeypoints, noArray(), false);
float countRatio = (float)validKeypoints.size() / (float)calcKeypoints.size();
ASSERT_LT(countRatio, 1 + badCountsRatio) << "Bad keypoints count ratio.";
ASSERT_GT(countRatio, 1 - badCountsRatio) << "Bad keypoints count ratio.";
int badPointCount = 0, commonPointCount = max((int)validKeypoints.size(), (int)calcKeypoints.size());
for( size_t v = 0; v < validKeypoints.size(); v++ )
{
int nearestIdx = -1;
float minDist = std::numeric_limits<float>::max();
float angleDistOfNearest = std::numeric_limits<float>::max();
for( size_t c = 0; c < calcKeypoints.size(); c++ )
{
if( validKeypoints[v].class_id != calcKeypoints[c].class_id )
continue;
float curDist = (float)cv::norm( calcKeypoints[c].pt - validKeypoints[v].pt );
if( curDist < minDist )
{
minDist = curDist;
nearestIdx = (int)c;
angleDistOfNearest = abs( calcKeypoints[c].angle - validKeypoints[v].angle );
}
else if( curDist == minDist ) // the keypoints whose positions are same but angles are different
{
float angleDist = abs( calcKeypoints[c].angle - validKeypoints[v].angle );
if( angleDist < angleDistOfNearest )
{
nearestIdx = (int)c;
angleDistOfNearest = angleDist;
}
}
}
if( nearestIdx == -1 || !isSimilarKeypoints( validKeypoints[v], calcKeypoints[nearestIdx] ) )
badPointCount++;
}
float badKeypointsRatio = (float)badPointCount / (float)commonPointCount;
std::cout << "badKeypointsRatio: " << badKeypointsRatio << std::endl;
ASSERT_LT( badKeypointsRatio , maxBadKeypointsRatio ) << "Bad accuracy!";
// Calc and compare descriptors. This uses validKeypoints for extraction.
Mat calcDescriptors;
ext->detectAndCompute(gray, Mat(), validKeypoints, calcDescriptors, true);
int dim = validDescriptors.cols;
int badDescriptorCount = 0;
L1<float> distance;
for( int i = 0; i < (int)validKeypoints.size(); i++ )
{
float dist = distance( validDescriptors.ptr<float>(i), calcDescriptors.ptr<float>(i), dim );
if( dist > badDescriptorDist )
badDescriptorCount++;
}
float badDescriptorRatio = (float)badDescriptorCount / (float)validKeypoints.size();
std::cout << "badDescriptorRatio: " << badDescriptorRatio << std::endl;
ASSERT_LT( badDescriptorRatio, maxBadDescriptorRatio ) << "Too many descriptors mismatched.";
#endif
}
}} // namespace

@@ -191,8 +191,28 @@ public:
     KDTreeIndexParams( int trees = 4 );
 };
 @endcode
+- **HierarchicalClusteringIndexParams** When passing an object of this type the index constructed
+will be a hierarchical tree of clusters, dividing each set of points into n clusters whose centers
+are picked among the points without further refinement of their position.
+This algorithm fits both floating, integer and binary vectors. :
+@code
+struct HierarchicalClusteringIndexParams : public IndexParams
+{
+    HierarchicalClusteringIndexParams(
+        int branching = 32,
+        flann_centers_init_t centers_init = CENTERS_RANDOM,
+        int trees = 4,
+        int leaf_size = 100);
+};
+@endcode
 - **KMeansIndexParams** When passing an object of this type the index constructed will be a
-hierarchical k-means tree. :
+hierarchical k-means tree (one tree by default), dividing each set of points into n clusters
+whose barycenters are refined iteratively.
+Note that this algorithm has been extended to the support of binary vectors as an alternative
+to LSH when knn search speed is the criterium. It will also outperform LSH when processing
+directly (i.e. without the use of MCA/PCA) datasets whose points share mostly the same values
+for most of the dimensions. It is recommended to set more than one tree with binary data. :
 @code
 struct KMeansIndexParams : public IndexParams
 {
@@ -201,6 +221,13 @@ public:
         int iterations = 11,
         flann_centers_init_t centers_init = CENTERS_RANDOM,
         float cb_index = 0.2 );
+    KMeansIndexParams(
+        int branching,
+        int iterations,
+        flann_centers_init_t centers_init,
+        float cb_index,
+        int trees );
 };
 @endcode
 - **CompositeIndexParams** When using a parameters object of this type the index created
@@ -219,7 +246,8 @@ public:
 - **LshIndexParams** When using a parameters object of this type the index created uses
 multi-probe LSH (by Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity Search
 by Qin Lv, William Josephson, Zhe Wang, Moses Charikar, Kai Li., Proceedings of the 33rd
-International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007) :
+International Conference on Very Large Data Bases (VLDB). Vienna, Austria. September 2007).
+This algorithm is designed for binary vectors. :
 @code
 struct LshIndexParams : public IndexParams
 {
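A minimal sketch of how these parameter objects are used together with a FLANN index; the random data, branching values and search checks below are arbitrary placeholders, and the same pattern applies to the HierarchicalClusteringIndexParams documented above.

#include <opencv2/core.hpp>
#include <opencv2/flann.hpp>

int main()
{
    // 1000 random 32-dimensional float descriptors as stand-in data.
    cv::Mat features(1000, 32, CV_32F);
    cv::randu(features, 0.f, 1.f);

    // New in this change: the fifth argument selects how many k-means trees
    // are built and searched (more than one is recommended for binary data).
    cvflann::KMeansIndexParams params(32, 11, cvflann::FLANN_CENTERS_RANDOM, 0.2f, 4);
    cv::flann::GenericIndex< cvflann::L2<float> > index(features, params);

    // 3-NN search for the first five descriptors.
    cv::Mat queries = features.rowRange(0, 5).clone();
    cv::Mat indices(queries.rows, 3, CV_32S), dists(queries.rows, 3, CV_32F);
    index.knnSearch(queries, indices, dists, 3, cvflann::SearchParams(64));
    return 0;
}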

@@ -404,34 +404,16 @@ public:
      */
     virtual ~HierarchicalClusteringIndex()
     {
-        free_elements();
         if (root!=NULL) {
             delete[] root;
         }
         if (indices!=NULL) {
+            free_indices();
             delete[] indices;
         }
     }
-    /**
-     * Release the inner elements of indices[]
-     */
-    void free_elements()
-    {
-        if (indices!=NULL) {
-            for(int i=0; i<trees_; ++i) {
-                if (indices[i]!=NULL) {
-                    delete[] indices[i];
-                    indices[i] = NULL;
-                }
-            }
-        }
-    }
     /**
      * Returns size of index.
      */
@@ -467,7 +449,7 @@ public:
             throw FLANNException("Branching factor must be at least 2");
         }
-        free_elements();
+        free_indices();
         for (int i=0; i<trees_; ++i) {
             indices[i] = new int[size_];
@@ -503,13 +485,12 @@ public:
     void loadIndex(FILE* stream) CV_OVERRIDE
     {
-        free_elements();
         if (root!=NULL) {
             delete[] root;
         }
         if (indices!=NULL) {
+            free_indices();
             delete[] indices;
         }
@@ -650,6 +631,20 @@ private:
     }
+    /**
+     * Release the inner elements of indices[]
+     */
+    void free_indices()
+    {
+        if (indices!=NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (indices[i]!=NULL) {
+                    delete[] indices[i];
+                    indices[i] = NULL;
+                }
+            }
+        }
+    }
     void computeLabels(int* dsindices, int indices_length, int* centers, int centers_length, int* labels, DistanceType& cost)

@@ -57,8 +57,8 @@ namespace cvflann
 struct KMeansIndexParams : public IndexParams
 {
-    KMeansIndexParams(int branching = 32, int iterations = 11,
-                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    void indexParams(int branching, int iterations,
+                     flann_centers_init_t centers_init, float cb_index, int trees)
     {
         (*this)["algorithm"] = FLANN_INDEX_KMEANS;
         // branching factor
@@ -69,6 +69,20 @@ struct KMeansIndexParams : public IndexParams
         (*this)["centers_init"] = centers_init;
         // cluster boundary index. Used when searching the kmeans tree
         (*this)["cb_index"] = cb_index;
+        // number of kmeans trees to search in
+        (*this)["trees"] = trees;
+    }
+    KMeansIndexParams(int branching = 32, int iterations = 11,
+                      flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 )
+    {
+        indexParams(branching, iterations, centers_init, cb_index, 1);
+    }
+    KMeansIndexParams(int branching, int iterations,
+                      flann_centers_init_t centers_init, float cb_index, int trees)
+    {
+        indexParams(branching, iterations, centers_init, cb_index, trees);
     }
 };
@@ -347,6 +361,7 @@ public:
         veclen_ = dataset_.cols;
         branching_ = get_param(params,"branching",32);
+        trees_ = get_param(params,"trees",1);
         iterations_ = get_param(params,"iterations",11);
         if (iterations_<0) {
             iterations_ = (std::numeric_limits<int>::max)();
@@ -367,6 +382,13 @@ public:
         }
         cb_index_ = 0.4f;
+        root_ = new KMeansNodePtr[trees_];
+        indices_ = new int*[trees_];
+        for (int i=0; i<trees_; ++i) {
+            root_[i] = NULL;
+            indices_[i] = NULL;
+        }
     }
@@ -382,9 +404,11 @@ public:
     virtual ~KMeansIndex()
     {
         if (root_ != NULL) {
-            free_centers(root_);
+            free_centers();
+            delete[] root_;
         }
         if (indices_!=NULL) {
+            free_indices();
             delete[] indices_;
         }
     }
@@ -429,23 +453,24 @@ public:
             throw FLANNException("Branching factor must be at least 2");
         }
-        indices_ = new int[size_];
-        for (size_t i=0; i<size_; ++i) {
-            indices_[i] = int(i);
-        }
-        root_ = pool_.allocate<KMeansNode>();
-        std::memset(root_, 0, sizeof(KMeansNode));
+        free_indices();
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            for (size_t j=0; j<size_; ++j) {
+                indices_[i][j] = int(j);
+            }
+            root_[i] = pool_.allocate<KMeansNode>();
+            std::memset(root_[i], 0, sizeof(KMeansNode));
-        if(is_kdtree_distance::val || is_vector_space_distance::val)
-        {
-            computeNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeClustering(root_, indices_, (int)size_, branching_,0);
-        }
-        else
-        {
-            computeBitfieldNodeStatistics(root_, indices_, (unsigned int)size_);
-            computeBitfieldClustering(root_, indices_, (int)size_, branching_,0);
+            if(is_kdtree_distance::val || is_vector_space_distance::val) {
+                computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
+            else {
+                computeBitfieldNodeStatistics(root_[i], indices_[i], (unsigned int)size_);
+                computeBitfieldClustering(root_[i], indices_[i], (int)size_, branching_,0);
+            }
         }
     }
@@ -456,35 +481,43 @@ public:
         save_value(stream, iterations_);
         save_value(stream, memoryCounter_);
         save_value(stream, cb_index_);
-        save_value(stream, *indices_, (int)size_);
-        save_tree(stream, root_);
+        save_value(stream, trees_);
+        for (int i=0; i<trees_; ++i) {
+            save_value(stream, *indices_[i], (int)size_);
+            save_tree(stream, root_[i], i);
+        }
     }
     void loadIndex(FILE* stream) CV_OVERRIDE
     {
+        if (indices_!=NULL) {
+            free_indices();
+            delete[] indices_;
+        }
+        if (root_!=NULL) {
+            free_centers();
+        }
         load_value(stream, branching_);
         load_value(stream, iterations_);
         load_value(stream, memoryCounter_);
         load_value(stream, cb_index_);
-        if (indices_!=NULL) {
-            delete[] indices_;
-        }
-        indices_ = new int[size_];
-        load_value(stream, *indices_, size_);
-        if (root_!=NULL) {
-            free_centers(root_);
+        load_value(stream, trees_);
+        indices_ = new int*[trees_];
+        for (int i=0; i<trees_; ++i) {
+            indices_[i] = new int[size_];
+            load_value(stream, *indices_[i], size_);
+            load_tree(stream, root_[i], i);
         }
-        load_tree(stream, root_);
         index_params_["algorithm"] = getType();
         index_params_["branching"] = branching_;
+        index_params_["trees"] = trees_;
         index_params_["iterations"] = iterations_;
         index_params_["centers_init"] = centers_init_;
         index_params_["cb_index"] = cb_index_;
     }
@@ -500,17 +533,21 @@ public:
     void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
     {
-        int maxChecks = get_param(searchParams,"checks",32);
+        const int maxChecks = get_param(searchParams,"checks",32);
         if (maxChecks==FLANN_CHECKS_UNLIMITED) {
-            findExactNN(root_, result, vec);
+            findExactNN(root_[0], result, vec);
         }
         else {
             // Priority queue storing intermediate branches in the best-bin-first search
             Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
             int checks = 0;
-            findNN(root_, result, vec, checks, maxChecks, heap);
+            for (int i=0; i<trees_; ++i) {
+                findNN(root_[i], result, vec, checks, maxChecks, heap);
+                if ((checks >= maxChecks) && result.full())
+                    break;
+            }
             BranchSt branch;
             while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
@@ -521,7 +558,6 @@ public:
             CV_Assert(result.full());
         }
     }
     /**
@@ -541,7 +577,7 @@ public:
         DistanceType variance;
         KMeansNodePtr* clusters = new KMeansNodePtr[numClusters];
-        int clusterCount = getMinVarianceClusters(root_, clusters, numClusters, variance);
+        int clusterCount = getMinVarianceClusters(root_[0], clusters, numClusters, variance);
         Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount);
@@ -611,23 +647,23 @@ private:
-    void save_tree(FILE* stream, KMeansNodePtr node)
+    void save_tree(FILE* stream, KMeansNodePtr node, int num)
     {
         save_value(stream, *node);
         save_value(stream, *(node->pivot), (int)veclen_);
         if (node->childs==NULL) {
-            int indices_offset = (int)(node->indices - indices_);
+            int indices_offset = (int)(node->indices - indices_[num]);
             save_value(stream, indices_offset);
         }
         else {
             for(int i=0; i<branching_; ++i) {
-                save_tree(stream, node->childs[i]);
+                save_tree(stream, node->childs[i], num);
             }
         }
     }
-    void load_tree(FILE* stream, KMeansNodePtr& node)
+    void load_tree(FILE* stream, KMeansNodePtr& node, int num)
     {
         node = pool_.allocate<KMeansNode>();
         load_value(stream, *node);
@@ -636,12 +672,12 @@ private:
         if (node->childs==NULL) {
             int indices_offset;
             load_value(stream, indices_offset);
-            node->indices = indices_ + indices_offset;
+            node->indices = indices_[num] + indices_offset;
         }
         else {
             node->childs = pool_.allocate<KMeansNodePtr>(branching_);
             for(int i=0; i<branching_; ++i) {
-                load_tree(stream, node->childs[i]);
+                load_tree(stream, node->childs[i], num);
             }
         }
     }
@@ -660,6 +696,32 @@ private:
         }
     }
void free_centers()
{
if (root_ != NULL) {
for(int i=0; i<trees_; ++i) {
if (root_[i] != NULL) {
free_centers(root_[i]);
}
}
}
}
/**
* Release the inner elements of indices[]
*/
void free_indices()
{
if (indices_!=NULL) {
for(int i=0; i<trees_; ++i) {
if (indices_[i]!=NULL) {
delete[] indices_[i];
indices_[i] = NULL;
}
}
}
}
    /**
     * Computes the statistics of a node (mean, radius, variance).
     *
@@ -960,7 +1022,45 @@ private:
    }
/**
* The method responsible with doing the recursive hierarchical clustering on
* binary vectors.
* As some might have heared that KMeans on binary data doesn't make sense,
* it's worth a little explanation why it actually fairly works. As
* with the Hierarchical Clustering algortihm, we seed several centers for the
* current node by picking some of its points. Then in a first pass each point
* of the node is then related to its closest center. Now let's have a look at
* the 5 central dimensions of the 9 following points:
*
* xxxxxx11100xxxxx (1)
* xxxxxx11010xxxxx (2)
* xxxxxx11001xxxxx (3)
* xxxxxx10110xxxxx (4)
* xxxxxx10101xxxxx (5)
* xxxxxx10011xxxxx (6)
* xxxxxx01110xxxxx (7)
* xxxxxx01101xxxxx (8)
* xxxxxx01011xxxxx (9)
* sum _____
* of 1: 66555
*
* Even if the barycenter notion doesn't apply, we can set a center
* xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
* on for these points.
*
* Note that convergence isn't ensured anymore. In practice, using Gonzales
* as seeding algorithm should be fine for getting convergence ("iterations"
* value can be set to -1). But with KMeans++ seeding you should definitely
* set a maximum number of iterations (but make it higher than the "iterations"
* default value of 11).
*
* Params:
* node = the node to cluster
* indices = indices of the points belonging to the current node
* indices_length = number of points in the current node
* branching = the branching factor to use in the clustering
* level = 0 for the root node, it increases with the subdivision levels
*/
    void computeBitfieldClustering(KMeansNodePtr node, int* indices,
                                   int indices_length, int branching, int level)
    {
@@ -1195,8 +1295,8 @@ private:
         }
         if (node->childs==NULL) {
-            if (checks>=maxChecks) {
-                if (result.full()) return;
+            if ((checks>=maxChecks) && result.full()) {
+                return;
             }
             checks += node->size;
             for (int i=0; i<node->size; ++i) {
@@ -1397,6 +1497,9 @@ private:
     /** The branching factor used in the hierarchical k-means clustering */
     int branching_;
+    /** Number of kmeans trees (default is one) */
+    int trees_;
     /** Maximum number of iterations to use when performing k-means clustering */
     int iterations_;
@@ -1432,12 +1535,12 @@ private:
     /**
      * The root node in the tree.
      */
-    KMeansNodePtr root_;
+    KMeansNodePtr* root_;
     /**
      * Array of indices to vectors in the dataset.
      */
-    int* indices_;
+    int** indices_;
     /**
      * The distance
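The computeBitfieldClustering comment above argues that a per-bit majority vote still yields a usable center for binary data. A toy, self-contained illustration of that idea (not code from this patch), reusing the nine example patterns from the comment embedded in 16-bit descriptors:

#include <bitset>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // The nine points from the comment, with the five "interesting" bits at positions 10..6.
    std::vector<uint16_t> points = {
        0b0000011100000000, 0b0000011010000000, 0b0000011001000000,
        0b0000010110000000, 0b0000010101000000, 0b0000010011000000,
        0b0000001110000000, 0b0000001101000000, 0b0000001011000000
    };
    uint16_t center = 0;
    for (int bit = 0; bit < 16; ++bit)
    {
        int ones = 0;
        for (uint16_t p : points)
            ones += (p >> bit) & 1;
        if (2 * ones > (int)points.size())          // majority of the points have this bit set
            center |= (uint16_t)(1u << bit);
    }
    std::cout << std::bitset<16>(center) << std::endl;  // prints 0000011111000000
    return 0;
}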

@@ -378,7 +378,7 @@ bool Jpeg2KDecoder::readComponent8u( uchar *data, void *_buffer,
     for( y = 0; y < yend - ystart; )
     {
-        jas_seqent_t* pix_row = &jas_matrix_get( buffer, y / ystep, 0 );
+        jas_seqent_t* pix_row = jas_matrix_getref( buffer, y / ystep, 0 );
         uchar* dst = data + (y - yoffset) * step - xoffset;
         if( xstep == 1 )
@@ -444,7 +444,7 @@ bool Jpeg2KDecoder::readComponent16u( unsigned short *data, void *_buffer,
     for( y = 0; y < yend - ystart; )
     {
-        jas_seqent_t* pix_row = &jas_matrix_get( buffer, y / ystep, 0 );
+        jas_seqent_t* pix_row = jas_matrix_getref( buffer, y / ystep, 0 );
         ushort* dst = data + (y - yoffset) * step - xoffset;
         if( xstep == 1 )

@@ -2310,7 +2310,7 @@ CV_EXPORTS_W void warpAffine( InputArray src, OutputArray dst,
                               const Scalar& borderValue = Scalar());
 /** @example samples/cpp/warpPerspective_demo.cpp
-An example program shows using cv::findHomography and cv::warpPerspective for image warping
+An example program shows using cv::getPerspectiveTransform and cv::warpPerspective for image warping
 */
 /** @brief Applies a perspective transformation to an image.

@@ -272,7 +272,7 @@ public:
     @param pano Final pano.
     @return Status code.
      */
-    Status composePanorama(InputArrayOfArrays images, OutputArray pano);
+    CV_WRAP Status composePanorama(InputArrayOfArrays images, OutputArray pano);
     /** @overload */
     CV_WRAP Status stitch(InputArrayOfArrays images, OutputArray pano);

@@ -19,6 +19,7 @@ class stitching_test(NewOpenCVTests):
        self.assertAlmostEqual(pano.shape[0], 685, delta=100, msg="rows: %r" % list(pano.shape))
        self.assertAlmostEqual(pano.shape[1], 1025, delta=100, msg="cols: %r" % list(pano.shape))
class stitching_detail_test(NewOpenCVTests):
    def test_simple(self):
@@ -82,5 +83,37 @@ class stitching_detail_test(NewOpenCVTests):
        timelapser = cv.detail.Timelapser_createDefault(cv.detail.Timelapser_CROP);
        self.assertIsNotNone(timelapser)
class stitching_compose_panorama_test_no_args(NewOpenCVTests):
def test_simple(self):
img1 = self.get_sample('stitching/a1.png')
img2 = self.get_sample('stitching/a2.png')
stitcher = cv.Stitcher.create(cv.Stitcher_PANORAMA)
stitcher.estimateTransform((img1, img2))
result, _ = stitcher.composePanorama()
assert result == 0
class stitching_compose_panorama_args(NewOpenCVTests):
def test_simple(self):
img1 = self.get_sample('stitching/a1.png')
img2 = self.get_sample('stitching/a2.png')
stitcher = cv.Stitcher.create(cv.Stitcher_PANORAMA)
stitcher.estimateTransform((img1, img2))
result, _ = stitcher.composePanorama((img1, img2))
assert result == 0
if __name__ == '__main__':
    NewOpenCVTests.bootstrap()

@ -0,0 +1,199 @@
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/features2d.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/calib3d.hpp>
#include <iostream>
#include <iomanip>
using namespace std;
using namespace cv;
static void help(char** argv)
{
cout
<< "This is a sample usage of AffineFeature detector/extractor.\n"
<< "And this is a C++ version of samples/python/asift.py\n"
<< "Usage: " << argv[0] << "\n"
<< " [ --feature=<sift|orb|brisk> ] # Feature to use.\n"
<< " [ --flann ] # use Flann-based matcher instead of bruteforce.\n"
<< " [ --maxlines=<number(50 as default)> ] # The maximum number of lines in visualizing the matching result.\n"
<< " [ --image1=<image1(aero1.jpg as default)> ]\n"
<< " [ --image2=<image2(aero3.jpg as default)> ] # Path to images to compare."
<< endl;
}
static double timer()
{
return getTickCount() / getTickFrequency();
}
int main(int argc, char** argv)
{
vector<String> fileName;
cv::CommandLineParser parser(argc, argv,
"{help h ||}"
"{feature|brisk|}"
"{flann||}"
"{maxlines|50|}"
"{image1|aero1.jpg|}{image2|aero3.jpg|}");
if (parser.has("help"))
{
help(argv);
return 0;
}
string feature = parser.get<string>("feature");
bool useFlann = parser.has("flann");
int maxlines = parser.get<int>("maxlines");
fileName.push_back(samples::findFile(parser.get<string>("image1")));
fileName.push_back(samples::findFile(parser.get<string>("image2")));
if (!parser.check())
{
parser.printErrors();
cout << "See --help (or missing '=' between argument name and value?)" << endl;
return 1;
}
Mat img1 = imread(fileName[0], IMREAD_GRAYSCALE);
Mat img2 = imread(fileName[1], IMREAD_GRAYSCALE);
if (img1.empty())
{
cerr << "Image " << fileName[0] << " is empty or cannot be found" << endl;
return 1;
}
if (img2.empty())
{
cerr << "Image " << fileName[1] << " is empty or cannot be found" << endl;
return 1;
}
Ptr<Feature2D> backend;
Ptr<DescriptorMatcher> matcher;
if (feature == "sift")
{
backend = SIFT::create();
if (useFlann)
matcher = DescriptorMatcher::create("FlannBased");
else
matcher = DescriptorMatcher::create("BruteForce");
}
else if (feature == "orb")
{
backend = ORB::create();
if (useFlann)
matcher = makePtr<FlannBasedMatcher>(makePtr<flann::LshIndexParams>(6, 12, 1));
else
matcher = DescriptorMatcher::create("BruteForce-Hamming");
}
else if (feature == "brisk")
{
backend = BRISK::create();
if (useFlann)
matcher = makePtr<FlannBasedMatcher>(makePtr<flann::LshIndexParams>(6, 12, 1));
else
matcher = DescriptorMatcher::create("BruteForce-Hamming");
}
else
{
cerr << feature << " is not supported. See --help" << endl;
return 1;
}
cout << "extracting with " << feature << "..." << endl;
Ptr<AffineFeature> ext = AffineFeature::create(backend);
vector<KeyPoint> kp1, kp2;
Mat desc1, desc2;
ext->detectAndCompute(img1, Mat(), kp1, desc1);
ext->detectAndCompute(img2, Mat(), kp2, desc2);
cout << "img1 - " << kp1.size() << " features, "
<< "img2 - " << kp2.size() << " features"
<< endl;
cout << "matching with " << (useFlann ? "flann" : "bruteforce") << "..." << endl;
double start = timer();
// match and draw
vector< vector<DMatch> > rawMatches;
vector<Point2f> p1, p2;
vector<float> distances;
matcher->knnMatch(desc1, desc2, rawMatches, 2);
// filter_matches
for (size_t i = 0; i < rawMatches.size(); i++)
{
const vector<DMatch>& m = rawMatches[i];
if (m.size() == 2 && m[0].distance < m[1].distance * 0.75)
{
p1.push_back(kp1[m[0].queryIdx].pt);
p2.push_back(kp2[m[0].trainIdx].pt);
distances.push_back(m[0].distance);
}
}
vector<uchar> status;
vector< pair<Point2f, Point2f> > pointPairs;
Mat H = findHomography(p1, p2, status, RANSAC);
int inliers = 0;
for (size_t i = 0; i < status.size(); i++)
{
if (status[i])
{
pointPairs.push_back(make_pair(p1[i], p2[i]));
distances[inliers] = distances[i];
// CV_Assert(inliers <= (int)i);
inliers++;
}
}
distances.resize(inliers);
cout << "execution time: " << fixed << setprecision(2) << (timer()-start)*1000 << " ms" << endl;
cout << inliers << " / " << status.size() << " inliers/matched" << endl;
cout << "visualizing..." << endl;
vector<int> indices(inliers);
cv::sortIdx(distances, indices, SORT_EVERY_ROW+SORT_ASCENDING);
// explore_match
int h1 = img1.size().height;
int w1 = img1.size().width;
int h2 = img2.size().height;
int w2 = img2.size().width;
Mat vis = Mat::zeros(max(h1, h2), w1+w2, CV_8U);
img1.copyTo(Mat(vis, Rect(0, 0, w1, h1)));
img2.copyTo(Mat(vis, Rect(w1, 0, w2, h2)));
cvtColor(vis, vis, COLOR_GRAY2BGR);
vector<Point2f> corners(4);
corners[0] = Point2f(0, 0);
corners[1] = Point2f((float)w1, 0);
corners[2] = Point2f((float)w1, (float)h1);
corners[3] = Point2f(0, (float)h1);
vector<Point2i> icorners;
perspectiveTransform(corners, corners, H);
transform(corners, corners, Matx23f(1,0,(float)w1,0,1,0));
Mat(corners).convertTo(icorners, CV_32S);
polylines(vis, icorners, true, Scalar(255,255,255));
for (int i = 0; i < min(inliers, maxlines); i++)
{
int idx = indices[i];
const Point2f& pi1 = pointPairs[idx].first;
const Point2f& pi2 = pointPairs[idx].second;
circle(vis, pi1, 2, Scalar(0,255,0), -1);
circle(vis, pi2 + Point2f((float)w1,0), 2, Scalar(0,255,0), -1);
line(vis, pi1, pi2 + Point2f((float)w1,0), Scalar(0,255,0));
}
if (inliers > maxlines)
cout << "only " << maxlines << " inliers are visualized" << endl;
imshow("affine find_obj", vis);
// Mat vis2 = Mat::zeros(max(h1, h2), w1+w2, CV_8U);
// Mat warp1;
// warpPerspective(img1, warp1, H, Size(w1, h1));
// warp1.copyTo(Mat(vis2, Rect(0, 0, w1, h1)));
// img2.copyTo(Mat(vis2, Rect(w1, 0, w2, h2)));
// imshow("warped", vis2);
waitKey();
cout << "done" << endl;
return 0;
}

@@ -8,7 +8,6 @@
 #include "opencv2/imgproc.hpp"
 #include "opencv2/imgcodecs.hpp"
 #include "opencv2/highgui.hpp"
-#include "opencv2/calib3d.hpp"
 #include <iostream>
 using namespace std;
@@ -36,6 +35,7 @@ Mat warping(Mat image, Size warped_image_size, vector< Point2f> srcPoints, vecto
 String windowTitle = "Perspective Transformation Demo";
 String labels[4] = { "TL","TR","BR","BL" };
 vector< Point2f> roi_corners;
+vector< Point2f> midpoints(4);
 vector< Point2f> dst_corners(4);
 int roiIndex = 0;
 bool dragging;
@@ -99,21 +99,26 @@ int main(int argc, char** argv)
         imshow( windowTitle, image );
+        midpoints[0] = (roi_corners[0] + roi_corners[1]) / 2;
+        midpoints[1] = (roi_corners[1] + roi_corners[2]) / 2;
+        midpoints[2] = (roi_corners[2] + roi_corners[3]) / 2;
+        midpoints[3] = (roi_corners[3] + roi_corners[0]) / 2;
         dst_corners[0].x = 0;
         dst_corners[0].y = 0;
-        dst_corners[1].x = (float)std::max(norm(roi_corners[0] - roi_corners[1]), norm(roi_corners[2] - roi_corners[3]));
+        dst_corners[1].x = (float)norm(midpoints[1] - midpoints[3]);
         dst_corners[1].y = 0;
-        dst_corners[2].x = (float)std::max(norm(roi_corners[0] - roi_corners[1]), norm(roi_corners[2] - roi_corners[3]));
-        dst_corners[2].y = (float)std::max(norm(roi_corners[1] - roi_corners[2]), norm(roi_corners[3] - roi_corners[0]));
+        dst_corners[2].x = dst_corners[1].x;
+        dst_corners[2].y = (float)norm(midpoints[0] - midpoints[2]);
         dst_corners[3].x = 0;
-        dst_corners[3].y = (float)std::max(norm(roi_corners[1] - roi_corners[2]), norm(roi_corners[3] - roi_corners[0]));
+        dst_corners[3].y = dst_corners[2].y;
         Size warped_image_size = Size(cvRound(dst_corners[2].x), cvRound(dst_corners[2].y));
-        Mat H = findHomography(roi_corners, dst_corners); //get homography
+        Mat M = getPerspectiveTransform(roi_corners, dst_corners);
         Mat warped_image;
-        warpPerspective(original_image, warped_image, H, warped_image_size); // do perspective transformation
+        warpPerspective(original_image, warped_image, M, warped_image_size); // do perspective transformation
         imshow("Warped Image", warped_image);
     }

(Binary image files changed in this merge; the diff viewer reports only their before/after sizes, omitted here.)