diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index e3ae62fae7..2202cdff3b 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -123,6 +123,9 @@ if(CV_GCC OR CV_CLANG)
   add_extra_compiler_option(-Wsign-promo)
   add_extra_compiler_option(-Wuninitialized)
   add_extra_compiler_option(-Winit-self)
+  if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
+    add_extra_compiler_option(-Wno-psabi)
+  endif()
   if(HAVE_CXX11)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT ENABLE_PRECOMPILED_HEADERS)
       add_extra_compiler_option(-Wsuggest-override)
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index a4d2c29d34..d3f78beb8e 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -845,36 +845,24 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 /** Mask **/
 inline int v_signmask(const v_uint8x16& a)
 {
-    vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
-    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
-    sv = vec_sl(sv, slm);
-    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
-    static const vec_uint4 slm4 = {0, 0, 8, 8};
-    sv4 = vec_sl(sv4, slm4);
-    return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
+    static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_int8x16& a)
 { return v_signmask(v_reinterpret_as_u8(a)); }

 inline int v_signmask(const v_int16x8& a)
 {
-    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
-    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
-    sv = vec_sl(sv, slm);
-    vec_int4 svi = vec_int4_z;
-    svi = vec_sums(vec_sum4s(sv, svi), svi);
-    return vec_extract(svi, 3);
+    static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint16x8& a)
 { return v_signmask(v_reinterpret_as_s16(a)); }

 inline int v_signmask(const v_int32x4& a)
 {
-    static const vec_uint4 slm = {0, 1, 2, 3};
-    vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
-    sv = vec_sl(sv, slm);
-    sv = vec_sums(sv, vec_int4_z);
-    return vec_extract(sv, 3);
+    static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint32x4& a)
 { return v_signmask(v_reinterpret_as_s32(a)); }
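The vbpermq-based v_signmask above packs the most-significant bit of every lane into a small integer in two instructions: vec_vbpermq treats each qperm entry as a big-endian bit index into the 128-bit source (so 120 selects the sign bit of byte 15, and entries of 128 and above yield zero), which is why the u8 variant lists bit positions 120..0 and the wider variants pad with 128. As an illustration only (the helper name below is ours, not OpenCV's), a scalar sketch of the contract the intrinsic version implements:

    #include <cstdint>

    // Scalar equivalent of v_signmask(v_uint8x16): bit i of the result is
    // the MSB of byte i. The VSX version gathers all 16 bits with a single
    // vec_vbpermq plus one vec_extract.
    static inline int signmask_u8x16_ref(const uint8_t v[16])
    {
        int mask = 0;
        for (int i = 0; i < 16; ++i)
            mask |= ((v[i] >> 7) & 1) << i;
        return mask;
    }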
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index d78a443c2c..a459a06c5c 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -554,7 +554,9 @@ struct HWFeatures
         have[CV_CPU_FP16] = true;
     #endif
     #endif
-
+    #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
+        have[CV_CPU_NEON] = true;
+    #endif
         // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
         have[CV_CPU_VSX] = (CV_VSX);
         // TODO: Check VSX3 availability in runtime for other platforms
diff --git a/modules/core/test/test_ptr.cpp b/modules/core/test/test_ptr.cpp
index 885516d1b6..002bfa6c01 100644
--- a/modules/core/test/test_ptr.cpp
+++ b/modules/core/test/test_ptr.cpp
@@ -160,14 +160,7 @@ TEST(Core_Ptr, assignment)
 {
     Ptr<Reporter> p1(new Reporter(&deleted1));
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-CV_DO_PRAGMA(GCC diagnostic push)
-CV_DO_PRAGMA(GCC diagnostic ignored "-Wself-assign-overloaded")
-#endif
-    p1 = p1;
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-CV_DO_PRAGMA(GCC diagnostic pop)
-#endif
+    p1 = *&p1;
     EXPECT_FALSE(deleted1);
 }
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index f1c2eb71d6..fa6eadfb8d 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -37,7 +37,9 @@ else()
       -Wunused-parameter -Wsign-compare
   )
 endif()
-
+if(HAVE_CUDA)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
 if(NOT HAVE_CXX11)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-undef)  # LANG_CXX11 from protobuf files
 endif()
diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index fa4b2f9349..c1905cb9bf 100644
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -123,9 +123,12 @@ PERF_TEST_P_(DNNTestNetwork, SSD)
 PERF_TEST_P_(DNNTestNetwork, OpenFace)
 {
-    if (backend == DNN_BACKEND_HALIDE ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
+    if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
     processNet("dnn/openface_nn4.small2.v1.t7", "", "", Mat(cv::Size(96, 96), CV_32FC3));
 }
@@ -185,16 +188,6 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
 {
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-        throw SkipTestException("Test is disabled for MyriadX");
-#endif
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("Test is disabled for Myriad in OpenVINO 2019R2");
-#endif
-
     processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "", Mat(cv::Size(300, 300), CV_32FC3));
 }
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 3c0b8cff26..f2a3a7bf08 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -719,21 +719,23 @@ struct DataLayer : public Layer
         CV_Assert(numChannels <= 4);

         // Scale
-        auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                {numChannels});
+        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
+                                       InferenceEngine::Layout::C);
+        auto weights = InferenceEngine::make_shared_blob<float>(td);
         weights->allocate();
-        weights->set(std::vector<float>(numChannels, scaleFactors[0]));
+
+        float* weight_buf = weights->buffer().as<float*>();
+        std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);

         // Mean subtraction
-        auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                               {numChannels});
+        auto biases = InferenceEngine::make_shared_blob<float>(td);
         biases->allocate();
-        std::vector<float> biasesVec(numChannels);
+        float* bias_buf = biases->buffer().as<float*>();
+
         for (int i = 0; i < numChannels; ++i)
         {
-            biasesVec[i] = -means[0][i] * scaleFactors[0];
+            bias_buf[i] = -means[0][i] * scaleFactors[0];
         }
-        biases->set(biasesVec);

         InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
         addConstantData("weights", weights, ieLayer);
@@ -1536,7 +1538,11 @@ struct Net::Impl
             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
+#else
+                dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
+#endif
             }
         }
         else
@@ -1544,7 +1550,11 @@ struct Net::Impl
             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = ld.name;
+#else
+                dataPtr->setName(ld.name);
+#endif
             }
         }
     }
@@ -1565,7 +1575,11 @@ struct Net::Impl
             for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = netInputLayer->outNames[i];
+#else
+                dataPtr->setName(netInputLayer->outNames[i]);
+#endif
             }
         }
         else
@@ -1573,7 +1587,11 @@ struct Net::Impl
             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = ld.name;
+#else
+                dataPtr->setName(ld.name);
+#endif
             }
         }
         ieNode->net->addBlobs(ld.inputBlobsWrappers);
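The dnn.cpp changes above migrate off two Inference Engine APIs deprecated around 2019R1: the (Precision, dims) overload of make_shared_blob and Blob::set() give way to TensorDesc-based construction plus direct writes through buffer(), and Data::name becomes Data::setName() behind a version guard. A condensed sketch of the new fill pattern, mirroring the diff (the helper is illustrative, not OpenCV code):

    #include <inference_engine.hpp>
    #include <algorithm>

    InferenceEngine::Blob::Ptr makeConstBlob(size_t numChannels, float value)
    {
        // Describe precision, shape and layout up front...
        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32,
                                       {numChannels}, InferenceEngine::Layout::C);
        auto blob = InferenceEngine::make_shared_blob<float>(td);
        blob->allocate();                         // ...allocate...
        float* buf = blob->buffer().as<float*>();
        std::fill(buf, buf + numChannels, value); // ...and write in place (replaces Blob::set()).
        return blob;
    }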
diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp
index 88654623ac..ef44ed79c4 100644
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@@ -111,7 +111,8 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        CV_Assert(!input->dims.empty());
+        std::vector<size_t> dims = input->getDims();
+        CV_Assert(!dims.empty());

         InferenceEngine::Builder::Layer ieLayer(name);
         ieLayer.setName(name);
@@ -122,12 +123,10 @@ public:
         else
         {
             ieLayer.setType("Split");
-            ieLayer.getParameters()["axis"] = input->dims.size() - 1;
-            ieLayer.getParameters()["out_sizes"] = input->dims[0];
+            ieLayer.getParameters()["axis"] = dims.size() - 1;
+            ieLayer.getParameters()["out_sizes"] = dims[0];
         }
-        std::vector<size_t> shape(input->dims);
-        std::reverse(shape.begin(), shape.end());
-        ieLayer.setInputPorts({InferenceEngine::Port(shape)});
+        ieLayer.setInputPorts({InferenceEngine::Port(dims)});
         ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp
index 72baba71e6..aae9bdea1a 100644
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -316,7 +316,7 @@ public:
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

         InferenceEngine::Builder::ConcatLayer ieLayer(name);
-        ieLayer.setAxis(clamp(axis, input->dims.size()));
+        ieLayer.setAxis(clamp(axis, input->getDims().size()));
         ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 83e881381c..42a2597af6 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -541,15 +541,14 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        CV_Assert(input->dims.size() == 4 || input->dims.size() == 5);
-
-        const int inpCn = input->dims[input->dims.size() - 2];  // NOTE: input->dims are reversed (WHIO or WHDIO)
+        std::vector<size_t> dims = input->getDims();
+        CV_Assert(dims.size() == 4 || dims.size() == 5);
+        const int inpCn = dims[1];
         const int outCn = blobs[0].size[0];
         const int inpGroupCn = blobs[0].size[1];
         const int group = inpCn / inpGroupCn;
-
-        InferenceEngine::Layout layout = (input->dims.size() == 4) ? InferenceEngine::Layout::OIHW :
-                                                                     InferenceEngine::Layout::NCDHW;
+        InferenceEngine::Layout layout = (dims.size() == 4) ? InferenceEngine::Layout::OIHW :
+                                                              InferenceEngine::Layout::NCDHW;

         auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
         if (fusedWeights)
@@ -561,9 +560,10 @@ public:
         }
         else
         {
-            ieWeights = InferenceEngine::make_shared_blob<float>(
-                                InferenceEngine::Precision::FP32, layout,
-                                ieWeights->dims());
+            ieWeights = InferenceEngine::make_shared_blob<float>({
+                            InferenceEngine::Precision::FP32,
+                            ieWeights->getTensorDesc().getDims(), layout
+                        });
             ieWeights->allocate();

             Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, outCn);
@@ -1953,9 +1953,10 @@ public:
         auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
         if (fusedWeights)
         {
-            ieWeights = InferenceEngine::make_shared_blob<float>(
-                                InferenceEngine::Precision::FP32, layout,
-                                ieWeights->dims());
+            ieWeights = InferenceEngine::make_shared_blob<float>({
+                            InferenceEngine::Precision::FP32,
+                            ieWeights->getTensorDesc().getDims(), layout
+                        });
             ieWeights->allocate();

             int inpCn = blobs[0].size[0];
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp
index 09fac59078..b6b973d226 100644
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@@ -261,7 +261,8 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        if (input->dims.size() == 4)
+        std::vector<size_t> dims = input->getDims();
+        if (dims.size() == 4)
         {
             InferenceEngine::Builder::NormalizeLayer ieLayer(name);

@@ -270,13 +271,14 @@ public:
             ieLayer.setEpsilon(epsilon);

             InferenceEngine::Builder::Layer l = ieLayer;
-            const int numChannels = input->dims[2];  // NOTE: input->dims are reversed (whcn)
+            const int numChannels = dims[1];
             InferenceEngine::Blob::Ptr weights;
             if (blobs.empty())
             {
-                weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                   InferenceEngine::Layout::C,
-                                                                   {(size_t)numChannels});
+                weights = InferenceEngine::make_shared_blob<float>({
+                              InferenceEngine::Precision::FP32,
+                              {(size_t)numChannels}, InferenceEngine::Layout::C
+                          });
                 weights->allocate();

                 Mat weightsMat = infEngineBlobToMat(weights).reshape(1, numChannels);
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index 2ec4be17be..d8bdff96cc 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -167,9 +167,11 @@ public:
             if (kernel_size.size() == 3)
                 return preferableTarget == DNN_TARGET_CPU;
             if (preferableTarget == DNN_TARGET_MYRIAD) {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
                 if (type == MAX && (pad_l == 1 && pad_t == 1) && stride == Size(2, 2))
                 {
                     return !isMyriadX();
                 }
+#endif
                 return type == MAX || type == AVE;
             }
             else
diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index 5e22519c39..4486a0f6de 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -207,12 +207,13 @@ public:
         }
         else
         {
-            auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                    {numChannels});
+            auto weights = InferenceEngine::make_shared_blob<float>({
+                               InferenceEngine::Precision::FP32, {(size_t)numChannels},
+                               InferenceEngine::Layout::C
+                           });
             weights->allocate();
-
-            std::vector<float> ones(numChannels, 1);
-            weights->set(ones);
+            float* buf = weights->buffer().as<float*>();
+            std::fill(buf, buf + numChannels, 1);
             addConstantData("weights", weights, l);
         }
         if (hasBias)
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 7640d4637e..430555161b 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -301,14 +301,14 @@ public:
     {
         std::vector<size_t> outShape(numDims);
         for (int i = 0; i < numDims; ++i)
-            outShape[numDims - 1 - i] = sliceRanges[0][i].size();
+            outShape[i] = sliceRanges[0][i].size();

         ieLayer.getInputPorts()[1].setParameter("type", "weights");

-        // Fake blob which will be moved to inputs (as weights).
-        auto shapeSource = InferenceEngine::make_shared_blob<float>(
-                               InferenceEngine::Precision::FP32,
-                               InferenceEngine::Layout::ANY, outShape);
+        auto shapeSource = InferenceEngine::make_shared_blob<float>({
+                               InferenceEngine::Precision::FP32, outShape,
+                               InferenceEngine::Layout::ANY
+                           });
         shapeSource->allocate();
         addConstantData("weights", shapeSource, ieLayer);
     }
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index 127abb20d0..59c8163492 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -329,7 +329,8 @@ public:
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

         InferenceEngine::Builder::SoftMaxLayer ieLayer(name);
-        ieLayer.setAxis(clamp(axisRaw, input->dims.size()));
+        ieLayer.setAxis(clamp(axisRaw, input->getDims().size()));
+
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
 #endif  // HAVE_INF_ENGINE
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp
index 2635c4dc73..71e9a7b8aa 100644
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -45,13 +45,13 @@ infEngineWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
 InfEngineBackendNet::InfEngineBackendNet() : netBuilder("")
 {
     hasNetOwner = false;
-    targetDevice = InferenceEngine::TargetDevice::eCPU;
+    device_name = "CPU";
 }

 InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) : netBuilder(""), cnn(net)
 {
     hasNetOwner = true;
-    targetDevice = InferenceEngine::TargetDevice::eCPU;
+    device_name = "CPU";
 }

 void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
@@ -66,16 +66,13 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
     for (size_t i = 0; i < inpWrappers.size(); ++i)
     {
         const auto& inp = inpWrappers[i];
-        const std::string& inpName = inp->dataPtr->name;
+        const std::string& inpName = inp->dataPtr->getName();
         int inpId;
         it = layers.find(inpName);
         if (it == layers.end())
         {
             InferenceEngine::Builder::InputLayer inpLayer(!inpName.empty() ? inpName : kDefaultInpLayerName);
-
-            std::vector<size_t> shape(inp->blob->dims());
-            std::reverse(shape.begin(), shape.end());
-
+            std::vector<size_t> shape(inp->blob->getTensorDesc().getDims());
             inpLayer.setPort(InferenceEngine::Port(shape));
             inpId = netBuilder.addLayer(inpLayer);
@@ -89,7 +86,11 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
     }
     CV_Assert(!outputs.empty());
     InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     dataPtr->name = layerName;
+#else
+    dataPtr->setName(layerName);
+#endif
 }

 void InfEngineBackendNet::init(int targetId)
@@ -115,21 +116,22 @@ void InfEngineBackendNet::init(int targetId)
     switch (targetId)
     {
-    case DNN_TARGET_CPU:
-        targetDevice = InferenceEngine::TargetDevice::eCPU;
-        break;
-    case DNN_TARGET_OPENCL: case DNN_TARGET_OPENCL_FP16:
-        targetDevice = InferenceEngine::TargetDevice::eGPU;
-        break;
-    case DNN_TARGET_MYRIAD:
-        targetDevice = InferenceEngine::TargetDevice::eMYRIAD;
-        break;
-    case DNN_TARGET_FPGA:
-        targetDevice = InferenceEngine::TargetDevice::eFPGA;
-        break;
-    default:
-        CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
-    }
+        case DNN_TARGET_CPU:
+            device_name = "CPU";
+            break;
+        case DNN_TARGET_OPENCL:
+        case DNN_TARGET_OPENCL_FP16:
+            device_name = "GPU";
+            break;
+        case DNN_TARGET_MYRIAD:
+            device_name = "MYRIAD";
+            break;
+        case DNN_TARGET_FPGA:
+            device_name = "FPGA";
+            break;
+        default:
+            CV_Error(Error::StsNotImplemented, "Unknown target");
+    };

     for (const auto& name : requestedOutputs)
     {
@@ -141,14 +143,14 @@ void InfEngineBackendNet::init(int targetId)
         const std::string& name = it.first;
         auto blobIt = allBlobs.find(name);
         CV_Assert(blobIt != allBlobs.end());
-        it.second->setPrecision(blobIt->second->precision());
+        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());
     }
     for (const auto& it : cnn.getOutputsInfo())
     {
         const std::string& name = it.first;
         auto blobIt = allBlobs.find(name);
         CV_Assert(blobIt != allBlobs.end());
-        it.second->setPrecision(blobIt->second->precision());  // Should be always FP32
+        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());  // Should be always FP32
     }

     initPlugin(cnn);
@@ -223,16 +225,13 @@ static InferenceEngine::Layout estimateLayout(const Mat& m)

 static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std::string& name = "")
 {
-    std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-    std::reverse(reversedShape.begin(), reversedShape.end());
+    std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
     if (m.type() == CV_32F)
-        return InferenceEngine::DataPtr(
-            new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m))
-        );
+        return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+               {InferenceEngine::Precision::FP32, shape, estimateLayout(m)}));
     else if (m.type() == CV_8U)
-        return InferenceEngine::DataPtr(
-            new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::U8, estimateLayout(m))
-        );
+        return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+               {InferenceEngine::Precision::U8, shape, estimateLayout(m)}));
     else
         CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }
@@ -241,33 +240,33 @@ InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape,
                                                InferenceEngine::Layout layout)
 {
     if (m.type() == CV_32F)
-        return InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                        layout, shape, (float*)m.data);
+        return InferenceEngine::make_shared_blob<float>(
+            {InferenceEngine::Precision::FP32, shape, layout}, (float*)m.data);
     else if (m.type() == CV_8U)
-        return InferenceEngine::make_shared_blob<uint8_t>(InferenceEngine::Precision::U8,
-                                                          layout, shape, (uint8_t*)m.data);
+        return InferenceEngine::make_shared_blob<uint8_t>(
+            {InferenceEngine::Precision::U8, shape, layout}, (uint8_t*)m.data);
     else
         CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }

 InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout)
 {
-    std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-    std::reverse(reversedShape.begin(), reversedShape.end());
-    return wrapToInfEngineBlob(m, reversedShape, layout);
+    std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
+    return wrapToInfEngineBlob(m, shape, layout);
 }

 InferenceEngine::Blob::Ptr cloneBlob(const InferenceEngine::Blob::Ptr& blob)
 {
-    InferenceEngine::Precision precision = blob->precision();
     InferenceEngine::Blob::Ptr copy;
+    auto description = blob->getTensorDesc();
+    InferenceEngine::Precision precision = description.getPrecision();
     if (precision == InferenceEngine::Precision::FP32)
     {
-        copy = InferenceEngine::make_shared_blob<float>(precision, blob->layout(), blob->dims());
+        copy = InferenceEngine::make_shared_blob<float>(description);
     }
     else if (precision == InferenceEngine::Precision::U8)
     {
-        copy = InferenceEngine::make_shared_blob<uint8_t>(precision, blob->layout(), blob->dims());
+        copy = InferenceEngine::make_shared_blob<uint8_t>(description);
     }
     else
         CV_Error(Error::StsNotImplemented, "Unsupported blob precision");
@@ -296,10 +295,8 @@ InfEngineBackendWrapper::InfEngineBackendWrapper(Ptr<BackendWrapper> wrapper)
     Ptr<InfEngineBackendWrapper> ieWrapper = wrapper.dynamicCast<InfEngineBackendWrapper>();
     CV_Assert(!ieWrapper.empty());
     InferenceEngine::DataPtr srcData = ieWrapper->dataPtr;
-    dataPtr = InferenceEngine::DataPtr(
-        new InferenceEngine::Data(srcData->name, srcData->dims, srcData->precision,
-                                  srcData->layout)
-    );
+
+    dataPtr = InferenceEngine::DataPtr(new InferenceEngine::Data(srcData->getName(), srcData->getTensorDesc()));
     blob = ieWrapper->blob;
 }
@@ -324,12 +321,19 @@ void InfEngineBackendWrapper::setHostDirty()
 }

-static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
+static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
 {
-    static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
+    static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
     return sharedPlugins;
 }
-
+#else
+static InferenceEngine::Core& getCore()
+{
+    static InferenceEngine::Core core;
+    return core;
+}
+#endif

 #if !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)
 static bool detectMyriadX_()
@@ -362,24 +366,29 @@ static bool detectMyriadX_()
     InferenceEngine::CNNNetwork cnn = InferenceEngine::CNNNetwork(
         InferenceEngine::Builder::convertToICNNNetwork(builder.build()));

-    InferenceEngine::TargetDevice device = InferenceEngine::TargetDevice::eMYRIAD;
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     InferenceEngine::InferenceEnginePluginPtr enginePtr;
     {
         AutoLock lock(getInitializationMutex());
         auto& sharedPlugins = getSharedPlugins();
-        auto pluginIt = sharedPlugins.find(device);
+        auto pluginIt = sharedPlugins.find("MYRIAD");
         if (pluginIt != sharedPlugins.end()) {
             enginePtr = pluginIt->second;
         } else {
             auto dispatcher = InferenceEngine::PluginDispatcher({""});
-            enginePtr = dispatcher.getSuitablePlugin(device);
-            sharedPlugins[device] = enginePtr;
+            enginePtr = dispatcher.getPluginByDevice("MYRIAD");
+            sharedPlugins["MYRIAD"] = enginePtr;
         }
     }
     auto plugin = InferenceEngine::InferencePlugin(enginePtr);
     try
     {
         auto netExec = plugin.LoadNetwork(cnn, {{"VPU_PLATFORM", "VPU_2480"}});
+#else
+    try
+    {
+        auto netExec = getCore().LoadNetwork(cnn, "MYRIAD", {{"VPU_PLATFORM", "VPU_2480"}});
+#endif
         auto infRequest = netExec.CreateInferRequest();
     } catch(...) {
         return false;
@@ -388,38 +397,41 @@ static bool detectMyriadX_()
 }
 #endif  // !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)

-void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
+void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
 {
     CV_Assert(!isInitialized());

     try
     {
         AutoLock lock(getInitializationMutex());
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
         auto& sharedPlugins = getSharedPlugins();
-        auto pluginIt = sharedPlugins.find(targetDevice);
+        auto pluginIt = sharedPlugins.find(device_name);
         if (pluginIt != sharedPlugins.end())
         {
             enginePtr = pluginIt->second;
         }
         else
+#endif
         {
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
             auto dispatcher = InferenceEngine::PluginDispatcher({""});
-            if (targetDevice == InferenceEngine::TargetDevice::eFPGA)
+            if (device_name == "FPGA")
                 enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
             else
-                enginePtr = dispatcher.getSuitablePlugin(targetDevice);
-            sharedPlugins[targetDevice] = enginePtr;
-
+                enginePtr = dispatcher.getPluginByDevice(device_name);
+            sharedPlugins[device_name] = enginePtr;
+#else
+            isInit = true;
+#endif
             std::vector<std::string> candidates;
-
             std::string param_pluginPath = utils::getConfigurationParameterString("OPENCV_DNN_IE_EXTRA_PLUGIN_PATH", "");
             if (!param_pluginPath.empty())
            {
                 candidates.push_back(param_pluginPath);
             }
-            if (targetDevice == InferenceEngine::TargetDevice::eCPU ||
-                targetDevice == InferenceEngine::TargetDevice::eFPGA)
+            if (device_name == "CPU" || device_name == "FPGA")
             {
                 std::string suffixes[] = {"_avx2", "_sse4", ""};
                 bool haveFeature[] = {
@@ -449,7 +461,12 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
                 {
                     InferenceEngine::IExtensionPtr extension =
                         InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(libName);
+
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
                     enginePtr->AddExtension(extension, 0);
+#else
+                    getCore().AddExtension(extension, "CPU");
+#endif
                     CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << libName);
                     found = true;
                     break;
@@ -463,14 +480,24 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
             // Some of networks can work without a library of extra layers.
 #ifndef _WIN32
             // Limit the number of CPU threads.
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
             enginePtr->SetConfig({{
                 InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
             }}, 0);
+#else
+            if (device_name == "CPU")
+                getCore().SetConfig({{
+                    InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
+                }}, device_name);
+#endif
 #endif
         }
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
         plugin = InferenceEngine::InferencePlugin(enginePtr);
-        netExec = plugin.LoadNetwork(net, {});
+        netExec = plugin.LoadNetwork(net, {});
+#else
+        netExec = getCore().LoadNetwork(net, device_name);
+#endif
     }
     catch (const std::exception& ex)
     {
@@ -480,7 +507,11 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)

 bool InfEngineBackendNet::isInitialized()
 {
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     return (bool)enginePtr;
+#else
+    return isInit;
+#endif
 }

 void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs)
@@ -488,7 +519,7 @@ void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs)
     auto wrappers = infEngineWrappers(ptrs);
     for (const auto& wrapper : wrappers)
     {
-        std::string name = wrapper->dataPtr->name;
+        std::string name = wrapper->dataPtr->getName();
         name = name.empty() ? kDefaultInpLayerName : name;
         allBlobs.insert({name, wrapper->blob});
     }
@@ -503,7 +534,7 @@ void InfEngineBackendNet::InfEngineReqWrapper::makePromises(const std::vector<Ptr<BackendWrapper> >& outs)
     {
         outs[i]->futureMat = outProms[i].getArrayResult();
-        outsNames[i] = outs[i]->dataPtr->name;
+        outsNames[i] = outs[i]->dataPtr->getName();
     }
 }
@@ -627,11 +658,12 @@ void InfEngineBackendNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
 Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
 {
     // NOTE: Inference Engine sizes are reversed.
-    std::vector<size_t> dims = blob->dims();
-    std::vector<int> size(dims.rbegin(), dims.rend());
+    std::vector<size_t> dims = blob->getTensorDesc().getDims();
+    std::vector<int> size(dims.begin(), dims.end());
+    auto precision = blob->getTensorDesc().getPrecision();

     int type = -1;
-    switch (blob->precision())
+    switch (precision)
     {
         case InferenceEngine::Precision::FP32: type = CV_32F; break;
         case InferenceEngine::Precision::U8: type = CV_8U; break;
@@ -685,7 +717,10 @@ void InfEngineBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs,

 InferenceEngine::Blob::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob)
 {
-    auto halfs = InferenceEngine::make_shared_blob<int16_t>(InferenceEngine::Precision::FP16, blob->layout(), blob->dims());
+    auto halfs = InferenceEngine::make_shared_blob<int16_t>({
+                     InferenceEngine::Precision::FP16, blob->getTensorDesc().getDims(),
+                     blob->getTensorDesc().getLayout()
+                 });
     halfs->allocate();
     Mat floatsData(1, blob->size(), CV_32F, blob->buffer());
     Mat halfsData(1, blob->size(), CV_16SC1, halfs->buffer());
@@ -732,7 +767,11 @@ void resetMyriadDevice()
 {
 #ifdef HAVE_INF_ENGINE
     AutoLock lock(getInitializationMutex());
-    getSharedPlugins().erase(InferenceEngine::TargetDevice::eMYRIAD);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
+    getSharedPlugins().erase("MYRIAD");
+#else
+    getCore().UnregisterPlugin("MYRIAD");
+#endif
 #endif  // HAVE_INF_ENGINE
 }
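Everything in op_inf_engine.cpp now branches on INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1): older releases keep a per-device plugin cache filled through PluginDispatcher, while newer releases route every device through one shared InferenceEngine::Core addressed by a plain device string. Reduced to a sketch (it assumes the same version macros the diff uses and omits the extension loading and thread-limit configuration):

    #include <inference_engine.hpp>

    InferenceEngine::ExecutableNetwork loadFor(InferenceEngine::CNNNetwork& net,
                                               const std::string& device_name)
    {
    #if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
        // Pre-2019R2: locate a device-specific plugin and load through it.
        auto dispatcher = InferenceEngine::PluginDispatcher({""});
        auto enginePtr = dispatcher.getPluginByDevice(device_name);
        auto plugin = InferenceEngine::InferencePlugin(enginePtr);
        return plugin.LoadNetwork(net, {});
    #else
        // 2019R2+: a single process-wide Core serves every device.
        static InferenceEngine::Core core;
        return core.LoadNetwork(net, device_name);
    #endif
    }

Keeping the Core static mirrors getCore() in the diff; it also explains why resetMyriadDevice() switches from erasing a cached plugin to a core-level UnregisterPlugin("MYRIAD").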
diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp
index 6aa9a3b407..bfff1e2bf1 100644
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -92,18 +92,22 @@ public:
     void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                  bool isAsync);

-    void initPlugin(InferenceEngine::ICNNNetwork& net);
+    void initPlugin(InferenceEngine::CNNNetwork& net);

     void addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs);

 private:
     InferenceEngine::Builder::Network netBuilder;

-    InferenceEngine::InferenceEnginePluginPtr enginePtr;
-    InferenceEngine::InferencePlugin plugin;
     InferenceEngine::ExecutableNetwork netExec;
     InferenceEngine::BlobMap allBlobs;
-    InferenceEngine::TargetDevice targetDevice;
+    std::string device_name;
+#if INF_ENGINE_VER_MAJOR_LE(2019010000)
+    InferenceEngine::InferenceEnginePluginPtr enginePtr;
+    InferenceEngine::InferencePlugin plugin;
+#else
+    bool isInit = false;
+#endif

     struct InfEngineReqWrapper
     {
diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp
index f3cf6c9e3b..0076556854 100644
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@@ -136,13 +136,10 @@ static const std::vector<std::string> getOpenVINOTestModelsList()

 static inline void genData(const std::vector<size_t>& dims, Mat& m, Blob::Ptr& dataPtr)
 {
-    std::vector<int> reversedDims(dims.begin(), dims.end());
-    std::reverse(reversedDims.begin(), reversedDims.end());
-
-    m.create(reversedDims, CV_32F);
+    m.create(std::vector<int>(dims.begin(), dims.end()), CV_32F);
     randu(m, -1, 1);

-    dataPtr = make_shared_blob<float>(Precision::FP32, dims, (float*)m.data);
+    dataPtr = make_shared_blob<float>({Precision::FP32, dims, Layout::ANY}, (float*)m.data);
 }

 void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
@@ -154,32 +151,42 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,

     CNNNetwork net = reader.getNetwork();

+    std::string device_name;
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+    Core ie;
+#else
     InferenceEnginePluginPtr enginePtr;
     InferencePlugin plugin;
+#endif
     ExecutableNetwork netExec;
     InferRequest infRequest;
+
     try
     {
-        auto dispatcher = InferenceEngine::PluginDispatcher({""});
         switch (target)
         {
             case DNN_TARGET_CPU:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eCPU);
+                device_name = "CPU";
                 break;
             case DNN_TARGET_OPENCL:
             case DNN_TARGET_OPENCL_FP16:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eGPU);
+                device_name = "GPU";
                 break;
             case DNN_TARGET_MYRIAD:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eMYRIAD);
+                device_name = "MYRIAD";
                 break;
             case DNN_TARGET_FPGA:
-                enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
+                device_name = "FPGA";
                 break;
             default:
                 CV_Error(Error::StsNotImplemented, "Unknown target");
         };

+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+        auto dispatcher = InferenceEngine::PluginDispatcher({""});
+        enginePtr = dispatcher.getPluginByDevice(device_name);
+#endif
         if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA)
         {
             std::string suffixes[] = {"_avx2", "_sse4", ""};
@@ -202,16 +209,23 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
                 try
                 {
                     IExtensionPtr extension = make_so_pointer<IExtension>(libName);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+                    ie.AddExtension(extension, device_name);
+#else
                     enginePtr->AddExtension(extension, 0);
+#endif
                     break;
                 }
                 catch(...) {}
             }
             // Some of networks can work without a library of extra layers.
         }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+        netExec = ie.LoadNetwork(net, device_name);
+#else
         plugin = InferencePlugin(enginePtr);
-        netExec = plugin.LoadNetwork(net, {});
+        netExec = plugin.LoadNetwork(net, {});
+#endif
         infRequest = netExec.CreateInferRequest();
     }
     catch (const std::exception& ex)
@@ -224,7 +238,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
     BlobMap inputBlobs;
     for (auto& it : net.getInputsInfo())
     {
-        genData(it.second->getDims(), inputsMap[it.first], inputBlobs[it.first]);
+        genData(it.second->getTensorDesc().getDims(), inputsMap[it.first], inputBlobs[it.first]);
     }
     infRequest.SetInput(inputBlobs);

@@ -233,7 +247,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
     BlobMap outputBlobs;
     for (auto& it : net.getOutputsInfo())
     {
-        genData(it.second->dims, outputsMap[it.first], outputBlobs[it.first]);
+        genData(it.second->getTensorDesc().getDims(), outputsMap[it.first], outputBlobs[it.first]);
     }
     infRequest.SetOutput(outputBlobs);
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index 74e2c1cf40..6d45a89a05 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -469,6 +469,42 @@ INSTANTIATE_TEST_CASE_P(/**/, Async, Combine(
   Values(CV_32F, CV_8U),
   testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
 ));
+
+typedef testing::TestWithParam<Target> Test_Model_Optimizer;
+TEST_P(Test_Model_Optimizer, forward_two_nets)
+{
+    const int target = GetParam();
+
+    const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+    Net net0 = readNet(model, proto);
+    net0.setPreferableTarget(target);
+
+    Net net1 = readNet(model, proto);
+    net1.setPreferableTarget(target);
+
+    // Generate inputs.
+    int blobSize[] = {2, 6, 75, 113};
+    Mat input(4, &blobSize[0], CV_32F);
+    randu(input, 0, 255);
+
+    net0.setInput(input);
+    Mat ref0 = net0.forward().clone();
+
+    net1.setInput(input);
+    Mat ref1 = net1.forward();
+
+    net0.setInput(input);
+    Mat ref2 = net0.forward();
+
+    normAssert(ref0, ref2, 0, 0);
+}
+INSTANTIATE_TEST_CASE_P(/**/, Test_Model_Optimizer,
+  testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
+);
+
 #endif  // HAVE_INF_ENGINE

 }} // namespace
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 3dd0481f57..fa98e745f5 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -357,11 +357,9 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 #if defined(INF_ENGINE_RELEASE)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
     {
-#if INF_ENGINE_VER_MAJOR_EQ(2019010000)
+#if INF_ENGINE_VER_MAJOR_GE(2019020000)
         if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#else
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif
     }
 #endif
@@ -395,16 +393,10 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
 {
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-    {
-#if INF_ENGINE_VER_MAJOR_LE(2019010000)
-        if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#else
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
-#endif
-    }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+        getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
 #endif

     checkBackend();
@@ -456,12 +448,13 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
     float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
 #if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-       )
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+        getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+    {
         scoreDiff = 0.061;
         iouDiff = 0.12;
         detectionConfThresh = 0.36;
+    }
 #endif
     normAssertDetections(ref, out, "", detectionConfThresh, scoreDiff, iouDiff);
     expectNoFallbacksFromIE(net);
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index ddc7f18acb..de81093d8d 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -262,7 +262,7 @@ class Test_Torch_nets : public DNNTestLayer {};

 TEST_P(Test_Torch_nets, OpenFace_accuracy)
 {
-#if defined(INF_ENGINE_RELEASE)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif
@@ -287,8 +287,8 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
     // Reference output values are in range [-0.17212, 0.263492]
     // on Myriad problem layer: l4_Pooling - does not use pads_begin
-    float l1 = (target == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
-    float lInf = (target == DNN_TARGET_OPENCL_FP16) ? 1.5e-3 : 1e-3;
+    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2e-3 : 1e-5;
+    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : 1e-3;
     Mat outRef = readTorchBlob(_tf("net_openface_output.dat"), true);
     normAssert(out, outRef, "", l1, lInf);
 }
diff --git a/modules/js/src/embindgen.py b/modules/js/src/embindgen.py
index 5b5cd98b9f..5f7599668d 100644
--- a/modules/js/src/embindgen.py
+++ b/modules/js/src/embindgen.py
@@ -98,7 +98,7 @@ core = {'': ['absdiff', 'add', 'addWeighted', 'bitwise_and', 'bitwise_not', 'bit
             'compare', 'convertScaleAbs', 'copyMakeBorder', 'countNonZero', 'determinant', 'dft', 'divide', 'eigen', \
             'exp', 'flip', 'getOptimalDFTSize','gemm', 'hconcat', 'inRange', 'invert', 'kmeans', 'log', 'magnitude', \
             'max', 'mean', 'meanStdDev', 'merge', 'min', 'minMaxLoc', 'mixChannels', 'multiply', 'norm', 'normalize', \
-            'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'setIdentity', 'setRNGSeed', \
+            'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed', \
             'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat'],
        'Algorithm': []}
diff --git a/modules/js/test/test_imgproc.js b/modules/js/test/test_imgproc.js
index 1f6c4c227d..673dfac549 100644
--- a/modules/js/test/test_imgproc.js
+++ b/modules/js/test/test_imgproc.js
@@ -941,4 +941,22 @@ QUnit.test('test_filter', function(assert) {
         inv3.delete();
         inv4.delete();
     }
+    //Rotate
+    {
+        let dst = new cv.Mat();
+        let src = cv.matFromArray(3, 2, cv.CV_8U, [1,2,3,4,5,6]);
+
+        cv.rotate(src, dst, cv.ROTATE_90_CLOCKWISE);
+
+        size = dst.size();
+        assert.equal(size.height, 2, "ROTATE_HEIGHT");
+        assert.equal(size.width, 3, "ROTATE_WIDTH");
+
+        let expected = new Uint8Array([5,3,1,6,4,2]);
+
+        assert.deepEqual(dst.data, expected);
+
+        dst.delete();
+        src.delete();
+    }
 });
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index e3e43bb86e..83eaba3d32 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -43,6 +43,7 @@
 #include "precomp.hpp"
 #include "cascadedetect.hpp"
 #include "opencv2/core/core_c.h"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencl_kernels_objdetect.hpp"

 #include <cstdio>
@@ -223,17 +224,6 @@ void HOGDescriptor::copyTo(HOGDescriptor& c) const
     c.signedGradient = signedGradient;
 }

-#if CV_NEON
-// replace of _mm_set_ps
-inline float32x4_t vsetq_f32(float f0, float f1, float f2, float f3)
-{
-    float32x4_t a = vdupq_n_f32(f0);
-    a = vsetq_lane_f32(f1, a, 1);
-    a = vsetq_lane_f32(f2, a, 2);
-    a = vsetq_lane_f32(f3, a, 3);
-    return a;
-}
-#endif
 void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
                                     Size paddingTL, Size paddingBR) const
 {
@@ -259,38 +249,22 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
     Mat_<float> _lut(1, 256);
     const float* const lut = &_lut(0,0);
-#if CV_SSE2
-    const int indices[] = { 0, 1, 2, 3 };
-    __m128i idx = _mm_loadu_si128((const __m128i*)indices);
-    __m128i ifour = _mm_set1_epi32(4);
+#if CV_SIMD128
+    v_float32x4 idx(0.0f, 1.0f, 2.0f, 3.0f);
+    v_float32x4 ifour = v_setall_f32(4.0);

     float* const _data = &_lut(0, 0);
-    if( gammaCorrection )
-        for( i = 0; i < 256; i += 4 )
+    if ( gammaCorrection )
+        for ( i = 0; i < 256; i += 4)
         {
-            _mm_storeu_ps(_data + i, _mm_sqrt_ps(_mm_cvtepi32_ps(idx)));
-            idx = _mm_add_epi32(idx, ifour);
+            v_store(_data + i, v_sqrt(idx));
+            idx += ifour;
         }
     else
-        for( i = 0; i < 256; i += 4 )
-        {
-            _mm_storeu_ps(_data + i, _mm_cvtepi32_ps(idx));
-            idx = _mm_add_epi32(idx, ifour);
-        }
-#elif CV_NEON
-    const int indices[] = { 0, 1, 2, 3 };
-    uint32x4_t idx = *(uint32x4_t*)indices;
-    uint32x4_t ifour = vdupq_n_u32(4);
-
-    float* const _data = &_lut(0, 0);
-    if( gammaCorrection )
-        for( i = 0; i < 256; i++ )
-            _lut(0,i) = std::sqrt((float)i);
-    else
-        for( i = 0; i < 256; i += 4 )
+        for ( i = 0; i < 256; i += 4)
         {
-            vst1q_f32(_data + i, vcvtq_f32_u32(idx));
-            idx = vaddq_u32 (idx, ifour);
+            v_store(_data + i, idx);
+            idx += ifour;
         }
 #else
     if( gammaCorrection )
@@ -327,17 +301,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
     {
         int end = gradsize.width + 2;
         xmap -= 1, x = 0;
-#if CV_SSE2
+#if CV_SIMD128
         for ( ; x <= end - 4; x += 4)
         {
-            __m128i mul_res = _mm_loadu_si128((const __m128i*)(xmap + x));
-            mul_res = _mm_add_epi32(_mm_add_epi32(mul_res, mul_res), mul_res);  // multiply by 3
-            _mm_storeu_si128((__m128i*)(xmap + x), mul_res);
+            v_int32x4 mul_res = v_load(xmap + x);
+            mul_res += mul_res + mul_res;
+            v_store(xmap + x, mul_res);
         }
-#elif CV_NEON
-        int32x4_t ithree = vdupq_n_s32(3);
-        for ( ; x <= end - 4; x += 4)
-            vst1q_s32(xmap + x, vmulq_s32(ithree, vld1q_s32(xmap + x)));
 #endif
         for ( ; x < end; ++x)
             xmap[x] *= 3;
@@ -368,46 +338,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
         else
         {
             x = 0;
-#if CV_SSE2
-            for( ; x <= width - 4; x += 4 )
-            {
-                int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
-                typedef const uchar* const T;
-                T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
-                T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
-                T p22 = imgPtr + xmap[x+3], p20 = p02;
-                T p32 = imgPtr + xmap[x+4], p30 = p12;
-
-                __m128 _dx0 = _mm_sub_ps(_mm_set_ps(lut[p32[0]], lut[p22[0]], lut[p12[0]], lut[p02[0]]),
-                                         _mm_set_ps(lut[p30[0]], lut[p20[0]], lut[p10[0]], lut[p00[0]]));
-                __m128 _dx1 = _mm_sub_ps(_mm_set_ps(lut[p32[1]], lut[p22[1]], lut[p12[1]], lut[p02[1]]),
-                                         _mm_set_ps(lut[p30[1]], lut[p20[1]], lut[p10[1]], lut[p00[1]]));
-                __m128 _dx2 = _mm_sub_ps(_mm_set_ps(lut[p32[2]], lut[p22[2]], lut[p12[2]], lut[p02[2]]),
-                                         _mm_set_ps(lut[p30[2]], lut[p20[2]], lut[p10[2]], lut[p00[2]]));
-
-                __m128 _dy0 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3]], lut[nextPtr[x2]], lut[nextPtr[x1]], lut[nextPtr[x0]]),
-                                         _mm_set_ps(lut[prevPtr[x3]], lut[prevPtr[x2]], lut[prevPtr[x1]], lut[prevPtr[x0]]));
-                __m128 _dy1 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+1]], lut[nextPtr[x2+1]], lut[nextPtr[x1+1]], lut[nextPtr[x0+1]]),
-                                         _mm_set_ps(lut[prevPtr[x3+1]], lut[prevPtr[x2+1]], lut[prevPtr[x1+1]], lut[prevPtr[x0+1]]));
-                __m128 _dy2 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+2]], lut[nextPtr[x2+2]], lut[nextPtr[x1+2]], lut[nextPtr[x0+2]]),
-                                         _mm_set_ps(lut[prevPtr[x3+2]], lut[prevPtr[x2+2]], lut[prevPtr[x1+2]], lut[prevPtr[x0+2]]));
-
-                __m128 _mag0 = _mm_add_ps(_mm_mul_ps(_dx0, _dx0), _mm_mul_ps(_dy0, _dy0));
-                __m128 _mag1 = _mm_add_ps(_mm_mul_ps(_dx1, _dx1), _mm_mul_ps(_dy1, _dy1));
-                __m128 _mag2 = _mm_add_ps(_mm_mul_ps(_dx2, _dx2), _mm_mul_ps(_dy2, _dy2));
-
-                __m128 mask = _mm_cmpgt_ps(_mag2, _mag1);
-                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx1));
-                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy1));
-
-                mask = _mm_cmpgt_ps(_mm_max_ps(_mag2, _mag1), _mag0);
-                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx0));
-                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy0));
-
-                _mm_storeu_ps(dbuf + x, _dx2);
-                _mm_storeu_ps(dbuf + x + width, _dy2);
-            }
-#elif CV_NEON
+#if CV_SIMD128
             for( ; x <= width - 4; x += 4 )
             {
                 int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
@@ -417,34 +348,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
                 T p22 = imgPtr + xmap[x+3], p20 = p02;
                 T p32 = imgPtr + xmap[x+4], p30 = p12;

-                float32x4_t _dx0 = vsubq_f32(vsetq_f32(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]),
-                                             vsetq_f32(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]));
-                float32x4_t _dx1 = vsubq_f32(vsetq_f32(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]),
-                                             vsetq_f32(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]));
-                float32x4_t _dx2 = vsubq_f32(vsetq_f32(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]),
-                                             vsetq_f32(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]));
-
-                float32x4_t _dy0 = vsubq_f32(vsetq_f32(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]),
-                                             vsetq_f32(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]));
-                float32x4_t _dy1 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]),
-                                             vsetq_f32(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]));
-                float32x4_t _dy2 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]),
-                                             vsetq_f32(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]));
-
-                float32x4_t _mag0 = vaddq_f32(vmulq_f32(_dx0, _dx0), vmulq_f32(_dy0, _dy0));
-                float32x4_t _mag1 = vaddq_f32(vmulq_f32(_dx1, _dx1), vmulq_f32(_dy1, _dy1));
-                float32x4_t _mag2 = vaddq_f32(vmulq_f32(_dx2, _dx2), vmulq_f32(_dy2, _dy2));
-
-                uint32x4_t mask = vcgtq_f32(_mag2, _mag1);
-                _dx2 = vbslq_f32(mask, _dx2, _dx1);
-                _dy2 = vbslq_f32(mask, _dy2, _dy1);
-
-                mask = vcgtq_f32(vmaxq_f32(_mag2, _mag1), _mag0);
-                _dx2 = vbslq_f32(mask, _dx2, _dx0);
-                _dy2 = vbslq_f32(mask, _dy2, _dy0);
-
-                vst1q_f32(dbuf + x, _dx2);
-                vst1q_f32(dbuf + x + width, _dy2);
+                v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) -
+                                   v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]);
+                v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) -
+                                   v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]);
+                v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) -
+                                   v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]);
+
+                v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) -
+                                   v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]);
+                v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) -
+                                   v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
+                v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) -
+                                   v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
+
+                v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
+                v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
+                v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2);
+
+                v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1);
+                _dx2 = v_select(mask, _dx2, _dx1);
+                _dy2 = v_select(mask, _dy2, _dy1);
+
+                mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0);
+                _dx2 = v_select(mask, _dx2, _dx0);
+                _dy2 = v_select(mask, _dy2, _dy0);
+
+                v_store(dbuf + x, _dx2);
+                v_store(dbuf + x + width, _dy2);
             }
 #endif
             for( ; x < width; x++ )
@@ -488,44 +419,40 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,

     // filling the result matrix
     x = 0;
-#if CV_SSE2
-    __m128 fhalf = _mm_set1_ps(0.5f), fzero = _mm_setzero_ps();
-    __m128 _angleScale = _mm_set1_ps(angleScale), fone = _mm_set1_ps(1.0f);
-    __m128i ione = _mm_set1_epi32(1), _nbins = _mm_set1_epi32(nbins), izero = _mm_setzero_si128();
+#if CV_SIMD128
+    v_float32x4 fhalf = v_setall_f32(0.5f);
+    v_float32x4 _angleScale = v_setall_f32(angleScale), fone = v_setall_f32(1.0f);
+    v_int32x4 ione = v_setall_s32(1), _nbins = v_setall_s32(nbins), izero = v_setzero_s32();

     for ( ; x <= width - 4; x += 4)
     {
         int x2 = x << 1;
-        __m128 _mag = _mm_loadu_ps(dbuf + x + (width << 1));
-        __m128 _angle = _mm_loadu_ps(dbuf + x + width * 3);
-        _angle = _mm_sub_ps(_mm_mul_ps(_angleScale, _angle), fhalf);
-
-        __m128 sign = _mm_and_ps(fone, _mm_cmplt_ps(_angle, fzero));
-        __m128i _hidx = _mm_cvttps_epi32(_angle);
-        _hidx = _mm_sub_epi32(_hidx, _mm_cvtps_epi32(sign));
-        _angle = _mm_sub_ps(_angle, _mm_cvtepi32_ps(_hidx));
-
-        __m128 ft0 = _mm_mul_ps(_mag, _mm_sub_ps(fone, _angle));
-        __m128 ft1 = _mm_mul_ps(_mag, _angle);
-        __m128 ft2 = _mm_unpacklo_ps(ft0, ft1);
-        __m128 ft3 = _mm_unpackhi_ps(ft0, ft1);
-
-        _mm_storeu_ps(gradPtr + x2, ft2);
-        _mm_storeu_ps(gradPtr + x2 + 4, ft3);
-
-        __m128i mask0 = _mm_sub_epi32(izero, _mm_srli_epi32(_hidx, 31));
-        __m128i it0 = _mm_and_si128(mask0, _nbins);
-        mask0 = _mm_cmplt_epi32(_hidx, _nbins);
-        __m128i it1 = _mm_andnot_si128(mask0, _nbins);
-        _hidx = _mm_add_epi32(_hidx, _mm_sub_epi32(it0, it1));
-
-        it0 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
-        _hidx = _mm_add_epi32(ione, _hidx);
-        _hidx = _mm_and_si128(_hidx, _mm_cmplt_epi32(_hidx, _nbins));
-        it1 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
-        it0 = _mm_unpacklo_epi8(it0, it1);
-
-        _mm_storel_epi64((__m128i*)(qanglePtr + x2), it0);
+        v_float32x4 _mag = v_load(dbuf + x + (width << 1));
+        v_float32x4 _angle = v_load(dbuf + x + width * 3);
+        _angle = (_angleScale * _angle) - fhalf;
+
+        v_int32x4 _hidx = v_floor(_angle);
+        _angle -= v_cvt_f32(_hidx);
+
+        v_float32x4 ft0 = _mag * (fone - _angle);
+        v_float32x4 ft1 = _mag * _angle;
+
+        v_store_interleave(gradPtr + x2, ft0, ft1);
+
+        v_int32x4 mask0 = _hidx >> 31;
+        v_int32x4 it0 = mask0 & _nbins;
+        mask0 = (_hidx >= _nbins);
+        v_int32x4 it1 = mask0 & _nbins;
+        _hidx += (it0 - it1);
+
+        it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
+        _hidx += ione;
+        _hidx &= (_hidx < _nbins);
+        it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
+        v_uint8x16 it2, it3;
+        v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3);
+
+        v_store_low(qanglePtr + x2, it2);
     }
 #endif
     for( ; x < width; x++ )
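The remaining hog.cpp hunks all follow the same recipe: delete the parallel SSE2 and NEON blocks and keep a single CV_SIMD128 universal-intrinsics path that compiles down to either instruction set (and to VSX). The per-lane strongest-channel selection is the one non-obvious idiom; a self-contained sketch of it, for illustration only (the helper is ours, not part of the patch):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    #if CV_SIMD128
    // Keep (dx2, dy2) in lanes where mag2 > mag1, otherwise take (dx1, dy1).
    // Comparisons yield all-ones lane masks, so v_select is branchless and
    // maps to blendvps / BSL / vsel depending on the target ISA.
    static inline void keepStronger(v_float32x4& dx2, v_float32x4& dy2,
                                    const v_float32x4& mag2, const v_float32x4& mag1,
                                    const v_float32x4& dx1, const v_float32x4& dy1)
    {
        v_float32x4 mask = v_reinterpret_as_f32(mag2 > mag1);
        dx2 = v_select(mask, dx2, dx1);
        dy2 = v_select(mask, dy2, dy1);
    }
    #endif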
float* hist = blockHist + pk.histOfs[0]; float t0 = hist[h0] + hist0[0]; @@ -939,31 +844,6 @@ const float* HOGCache::getBlock(Point pt, float* buf) t1 = hist[h1] + hist1[1]; hist[h0] = t0; hist[h1] = t1; } -#elif CV_NEON - float hist0[4], hist1[4]; - for( ; k < C2; k++ ) - { - const PixData& pk = _pixData[k]; - const float* const a = gradPtr + pk.gradOfs; - const uchar* const h = qanglePtr + pk.qangleOfs; - int h0 = h[0], h1 = h[1]; - - float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]); - float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights)); - - float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0], (blockHist + pk.histOfs[1])[h0], 0, 0); - float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1], (blockHist + pk.histOfs[1])[h1], 0, 0); - - float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w); - vst1q_f32(hist0, _t0); - vst1q_f32(hist1, _t1); - - (blockHist + pk.histOfs[0])[h0] = hist0[0]; - (blockHist + pk.histOfs[1])[h0] = hist0[1]; - - (blockHist + pk.histOfs[0])[h1] = hist1[0]; - (blockHist + pk.histOfs[1])[h1] = hist1[1]; - } #else for( ; k < C2; k++ ) { @@ -987,7 +867,7 @@ const float* HOGCache::getBlock(Point pt, float* buf) } #endif -#if CV_SSE2 +#if CV_SIMD128 for( ; k < C4; k++ ) { const PixData& pk = _pixData[k]; @@ -995,12 +875,12 @@ const float* HOGCache::getBlock(Point pt, float* buf) const uchar* const h = qanglePtr + pk.qangleOfs; int h0 = h[0], h1 = h[1]; - __m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]); - __m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights)); - __m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w); + v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]); + v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights); + v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w; - _mm_storeu_ps(hist0, _t0); - _mm_storeu_ps(hist1, _t1); + v_store(hist0, _t0); + v_store(hist1, _t1); float* hist = blockHist + pk.histOfs[0]; float t0 = hist[h0] + hist0[0]; @@ -1021,62 +901,6 @@ const float* HOGCache::getBlock(Point pt, float* buf) t0 = hist[h0] + hist0[3]; t1 = hist[h1] + hist1[3]; hist[h0] = t0; hist[h1] = t1; - -// __m128 _hist0 = _mm_set_ps((blockHist + pk.histOfs[3])[h0], (blockHist + pk.histOfs[2])[h0], -// (blockHist + pk.histOfs[1])[h0], (blockHist + pk.histOfs[0])[h0]); -// __m128 _hist1 = _mm_set_ps((blockHist + pk.histOfs[3])[h1], (blockHist + pk.histOfs[2])[h1], -// (blockHist + pk.histOfs[1])[h1], (blockHist + pk.histOfs[0])[h1]); -// -// _hist0 = _mm_add_ps(_t0, _hist0); -// _hist1 = _mm_add_ps(_t1, _hist1); -// -// _mm_storeu_ps(hist0, _hist0); -// _mm_storeu_ps(hist1, _hist1); -// -// (pk.histOfs[0] + blockHist)[h0] = hist0[0]; -// (pk.histOfs[1] + blockHist)[h0] = hist0[1]; -// (pk.histOfs[2] + blockHist)[h0] = hist0[2]; -// (pk.histOfs[3] + blockHist)[h0] = hist0[3]; -// -// (pk.histOfs[0] + blockHist)[h1] = hist1[0]; -// (pk.histOfs[1] + blockHist)[h1] = hist1[1]; -// (pk.histOfs[2] + blockHist)[h1] = hist1[2]; -// (pk.histOfs[3] + blockHist)[h1] = hist1[3]; - } -#elif CV_NEON - for( ; k < C4; k++ ) - { - const PixData& pk = _pixData[k]; - const float* const a = gradPtr + pk.gradOfs; - const uchar* const h = qanglePtr + pk.qangleOfs; - int h0 = h[0], h1 = h[1]; - - float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]); - float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights)); - - float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0], - (blockHist + pk.histOfs[1])[h0], - (blockHist + 
-                (blockHist + pk.histOfs[2])[h0],
-                (blockHist + pk.histOfs[3])[h0]);
-            float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1],
-                (blockHist + pk.histOfs[1])[h1],
-                (blockHist + pk.histOfs[2])[h1],
-                (blockHist + pk.histOfs[3])[h1]);
-
-
-            float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
-            vst1q_f32(hist0, _t0);
-            vst1q_f32(hist1, _t1);
-
-            (blockHist + pk.histOfs[0])[h0] = hist0[0];
-            (blockHist + pk.histOfs[1])[h0] = hist0[1];
-            (blockHist + pk.histOfs[2])[h0] = hist0[2];
-            (blockHist + pk.histOfs[3])[h0] = hist0[3];
-
-            (blockHist + pk.histOfs[0])[h1] = hist1[0];
-            (blockHist + pk.histOfs[1])[h1] = hist1[1];
-            (blockHist + pk.histOfs[2])[h1] = hist1[2];
-            (blockHist + pk.histOfs[3])[h1] = hist1[3];
         }
 #else
         for( ; k < C4; k++ )
@@ -1123,26 +947,16 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     float* hist = &_hist[0], sum = 0.0f, partSum[4];
     size_t i = 0, sz = blockHistogramSize;

-#if CV_SSE2
-    __m128 p0 = _mm_loadu_ps(hist);
-    __m128 s = _mm_mul_ps(p0, p0);
+#if CV_SIMD128
+    v_float32x4 p0 = v_load(hist);
+    v_float32x4 s = p0 * p0;

     for (i = 4; i <= sz - 4; i += 4)
     {
-        p0 = _mm_loadu_ps(hist + i);
-        s = _mm_add_ps(s, _mm_mul_ps(p0, p0));
+        p0 = v_load(hist + i);
+        s += p0 * p0;
     }
-    _mm_storeu_ps(partSum, s);
-#elif CV_NEON
-    float32x4_t p0 = vld1q_f32(hist);
-    float32x4_t s = vmulq_f32(p0, p0);
-
-    for (i = 4; i <= sz - 4; i += 4)
-    {
-        p0 = vld1q_f32(hist + i);
-        s = vaddq_f32(s, vmulq_f32(p0, p0));
-    }
-    vst1q_f32(partSum, s);
+    v_store(partSum, s);
 #else
     partSum[0] = 0.0f;
     partSum[1] = 0.0f;
@@ -1165,44 +979,25 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold;
     i = 0, sum = 0.0f;

-#if CV_SSE2
-    __m128 _scale = _mm_set1_ps(scale);
-    static __m128 _threshold = _mm_set1_ps(thresh);
+#if CV_SIMD128
+    v_float32x4 _scale = v_setall_f32(scale);
+    v_float32x4 _threshold = v_setall_f32(thresh);

-    __m128 p = _mm_mul_ps(_scale, _mm_loadu_ps(hist));
-    p = _mm_min_ps(p, _threshold);
-    s = _mm_mul_ps(p, p);
-    _mm_storeu_ps(hist, p);
+    v_float32x4 p = _scale * v_load(hist);
+    p = v_min(p, _threshold);
+    s = p * p;
+    v_store(hist, p);

     for(i = 4 ; i <= sz - 4; i += 4)
     {
-        p = _mm_loadu_ps(hist + i);
-        p = _mm_mul_ps(p, _scale);
-        p = _mm_min_ps(p, _threshold);
-        s = _mm_add_ps(s, _mm_mul_ps(p, p));
-        _mm_storeu_ps(hist + i, p);
+        p = v_load(hist + i);
+        p *= _scale;
+        p = v_min(p, _threshold);
+        s += p * p;
+        v_store(hist + i, p);
     }
-    _mm_storeu_ps(partSum, s);
-#elif CV_NEON
-    float32x4_t _scale = vdupq_n_f32(scale);
-    static float32x4_t _threshold = vdupq_n_f32(thresh);
-
-    float32x4_t p = vmulq_f32(_scale, vld1q_f32(hist));
-    p = vminq_f32(p, _threshold);
-    s = vmulq_f32(p, p);
-    vst1q_f32(hist, p);
-
-    for(i = 4 ; i <= sz - 4; i += 4)
-    {
-        p = vld1q_f32(hist + i);
-        p = vmulq_f32(p, _scale);
-        p = vminq_f32(p, _threshold);
-        s = vaddq_f32(s, vmulq_f32(p, p));
-        vst1q_f32(hist + i, p);
-    }
-
-    vst1q_f32(partSum, s);
+    v_store(partSum, s);
 #else
     partSum[0] = 0.0f;
     partSum[1] = 0.0f;
@@ -1230,19 +1025,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     }

     scale = 1.f/(std::sqrt(sum)+1e-3f), i = 0;
-#if CV_SSE2
-    __m128 _scale2 = _mm_set1_ps(scale);
-    for ( ; i <= sz - 4; i += 4)
-    {
-        __m128 t = _mm_mul_ps(_scale2, _mm_loadu_ps(hist + i));
-        _mm_storeu_ps(hist + i, t);
-    }
-#elif CV_NEON
-    float32x4_t _scale2 = vdupq_n_f32(scale);
+#if CV_SIMD128
+    v_float32x4 _scale2 = v_setall_f32(scale);
     for ( ; i <= sz - 4; i += 4)
     {
-        float32x4_t t = vmulq_f32(_scale2, vld1q_f32(hist + i));
-        vst1q_f32(hist + i, t);
+        v_float32x4 t = _scale2 * v_load(hist + i);
+        v_store(hist + i, t);
     }
 #endif
     for ( ; i < sz; ++i)
@@ -1690,7 +1478,7 @@ void HOGDescriptor::detect(InputArray _img,
     double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
     std::vector<float> blockHist(blockHistogramSize);

-#if CV_SSE2 || CV_NEON
+#if CV_SIMD128
     float partSum[4];
 #endif

@@ -1719,37 +1507,20 @@ void HOGDescriptor::detect(InputArray _img,
             Point pt = pt0 + bj.imgOffset;
             const float* vec = cache.getBlock(pt, &blockHist[0]);
-#if CV_SSE2
-            __m128 _vec = _mm_loadu_ps(vec);
-            __m128 _svmVec = _mm_loadu_ps(svmVec);
-            __m128 sum = _mm_mul_ps(_svmVec, _vec);
-
-            for( k = 4; k <= blockHistogramSize - 4; k += 4 )
-            {
-                _vec = _mm_loadu_ps(vec + k);
-                _svmVec = _mm_loadu_ps(svmVec + k);
-
-                sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
-            }
-
-            _mm_storeu_ps(partSum, sum);
-            double t0 = partSum[0] + partSum[1];
-            double t1 = partSum[2] + partSum[3];
-            s += t0 + t1;
-#elif CV_NEON
-            float32x4_t _vec = vld1q_f32(vec);
-            float32x4_t _svmVec = vld1q_f32(svmVec);
-            float32x4_t sum = vmulq_f32(_svmVec, _vec);
+#if CV_SIMD128
+            v_float32x4 _vec = v_load(vec);
+            v_float32x4 _svmVec = v_load(svmVec);
+            v_float32x4 sum = _svmVec * _vec;

             for( k = 4; k <= blockHistogramSize - 4; k += 4 )
             {
-                _vec = vld1q_f32(vec + k);
-                _svmVec = vld1q_f32(svmVec + k);
+                _vec = v_load(vec + k);
+                _svmVec = v_load(svmVec + k);

-                sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec));
+                sum += _vec * _svmVec;
             }
-            vst1q_f32(partSum, sum);
+            v_store(partSum, sum);
             double t0 = partSum[0] + partSum[1];
             double t1 = partSum[2] + partSum[3];
             s += t0 + t1;
@@ -3530,7 +3301,7 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
     double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
     std::vector<float> blockHist(blockHistogramSize);

-#if CV_SSE2 || CV_NEON
+#if CV_SIMD128
     float partSum[4];
 #endif

@@ -3557,37 +3328,21 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
         // need to divide this into 4 parts!
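+        // SIMD dot product of the block histogram against the SVM weights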
         const float* vec = cache.getBlock(pt, &blockHist[0]);
-#if CV_SSE2
-        __m128 _vec = _mm_loadu_ps(vec);
-        __m128 _svmVec = _mm_loadu_ps(svmVec);
-        __m128 sum = _mm_mul_ps(_svmVec, _vec);
+#if CV_SIMD128
+        v_float32x4 _vec = v_load(vec);
+        v_float32x4 _svmVec = v_load(svmVec);
+        v_float32x4 sum = _svmVec * _vec;

         for( k = 4; k <= blockHistogramSize - 4; k += 4 )
         {
-            _vec = _mm_loadu_ps(vec + k);
-            _svmVec = _mm_loadu_ps(svmVec + k);
+            _vec = v_load(vec + k);
+            _svmVec = v_load(svmVec + k);

-            sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
+            sum += _vec * _svmVec;
         }

-        _mm_storeu_ps(partSum, sum);
-        double t0 = partSum[0] + partSum[1];
-        double t1 = partSum[2] + partSum[3];
-        s += t0 + t1;
-#elif CV_NEON
-        float32x4_t _vec = vld1q_f32(vec);
-        float32x4_t _svmVec = vld1q_f32(svmVec);
-        float32x4_t sum = vmulq_f32(_svmVec, _vec);
-
-        for( k = 4; k <= blockHistogramSize - 4; k += 4 )
-        {
-            _vec = vld1q_f32(vec + k);
-            _svmVec = vld1q_f32(svmVec + k);
-
-            sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec));
-        }
+        v_store(partSum, sum);

-        vst1q_f32(partSum, sum);
         double t0 = partSum[0] + partSum[1];
         double t1 = partSum[2] + partSum[3];
         s += t0 + t1;
diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt
index 03d550fb81..44f35eb59b 100644
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(the_description "Images stitching")

 if(HAVE_CUDA)
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wstrict-aliasing)
 endif()

 set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 0239650470..c6d2efd665 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -499,7 +499,7 @@ struct CvCapture_FFMPEG
     double r2d(AVRational r) const;
     int64_t dts_to_frame_number(int64_t dts);
-    double dts_to_sec(int64_t dts);
+    double dts_to_sec(int64_t dts) const;

     AVFormatContext * ic;
     AVCodec * avcodec;
@@ -892,7 +892,14 @@ bool CvCapture_FFMPEG::open( const char* _filename )
 #else
     av_dict_set(&dict, "rtsp_transport", "tcp", 0);
 #endif
-    int err = avformat_open_input(&ic, _filename, NULL, &dict);
+    AVInputFormat* input_format = NULL;
+    AVDictionaryEntry* entry = av_dict_get(dict, "input_format", NULL, 0);
+    if (entry != 0)
+    {
+        input_format = av_find_input_format(entry->value);
+    }
+
+    int err = avformat_open_input(&ic, _filename, input_format, &dict);
 #else
     int err = av_open_input_file(&ic, _filename, NULL, 0, NULL);
 #endif
@@ -1168,7 +1175,11 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
     switch( property_id )
     {
     case CAP_PROP_POS_MSEC:
-        return 1000.0*(double)frame_number/get_fps();
+        if (picture_pts == AV_NOPTS_VALUE_)
+        {
+            return 0;
+        }
+        return (dts_to_sec(picture_pts) * 1000);
     case CAP_PROP_POS_FRAMES:
         return (double)frame_number;
     case CAP_PROP_POS_AVI_RATIO:
@@ -1278,7 +1289,7 @@ int64_t CvCapture_FFMPEG::dts_to_frame_number(int64_t dts)
     return (int64_t)(get_fps() * sec + 0.5);
 }

-double CvCapture_FFMPEG::dts_to_sec(int64_t dts)
+double CvCapture_FFMPEG::dts_to_sec(int64_t dts) const
 {
     return (double)(dts - ic->streams[video_stream]->start_time) *
         r2d(ic->streams[video_stream]->time_base);
diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index 1922213454..db505b780f 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -796,11 +796,10 @@ bool CvCaptureCAM_V4L::open(int _index)
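+        // numeric index: open the camera through its /dev/videoN device node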
         name = cv::format("/dev/video%d", _index);
     }

-    /* Print the CameraNumber at the end of the string with a width of one character */
     bool res = open(name.c_str());
     if (!res)
     {
-        fprintf(stderr, "VIDEOIO ERROR: V4L: can't open camera by index %d\n", _index);
+        CV_LOG_WARNING(NULL, cv::format("VIDEOIO ERROR: V4L: can't open camera by index %d", _index));
     }
     return res;
 }
diff --git a/modules/videoio/test/test_video_io.cpp b/modules/videoio/test/test_video_io.cpp
index f6a345e04a..1330698d38 100644
--- a/modules/videoio/test/test_video_io.cpp
+++ b/modules/videoio/test/test_video_io.cpp
@@ -84,7 +84,7 @@ public:
     {
         if (!videoio_registry::hasBackend(apiPref))
             throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
-        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265"))
+        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
             throw SkipTestException("Unstable MSMF test");
         writeVideo();
         VideoCapture cap;
@@ -172,7 +172,7 @@ public:
     {
         if (!videoio_registry::hasBackend(apiPref))
             throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
-        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265"))
+        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
             throw SkipTestException("Unstable MSMF test");
         VideoCapture cap;
         EXPECT_NO_THROW(cap.open(video_file, apiPref));
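Note on the hog.cpp changes above: each parallel SSE2/NEON pair collapses into a single CV_SIMD128 block written against OpenCV's universal intrinsics, which compile to SSE2 on x86, NEON on ARM, and VSX on ppc64le from one source path. A minimal standalone sketch of the recurring load-multiply-accumulate idiom (illustrative only; the helper name and the plain-array interface are assumptions, not code from this patch):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Hypothetical helper: dot product written once against the universal
// intrinsics API instead of separate SSE2 and NEON code paths.
static float dotProduct(const float* a, const float* b, int n)
{
    float result = 0.f;
    int k = 0;
#if CV_SIMD128
    cv::v_float32x4 sum = cv::v_setzero_f32();
    for (; k <= n - 4; k += 4)
        sum += cv::v_load(a + k) * cv::v_load(b + k); // 4 lanes per iteration
    result = cv::v_reduce_sum(sum);                   // horizontal sum of the 4 lanes
#endif
    for (; k < n; ++k)                                // scalar tail
        result += a[k] * b[k];
    return result;
}
```

The scalar tail after the SIMD loop is the same pattern the `for ( ; i < sz; ++i)`-style loops retained throughout hog.cpp serve: correctness for sizes that are not a multiple of the four-float vector width.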