diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index e3ae62fae7..2202cdff3b 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -123,6 +123,9 @@ if(CV_GCC OR CV_CLANG)
   add_extra_compiler_option(-Wsign-promo)
   add_extra_compiler_option(-Wuninitialized)
   add_extra_compiler_option(-Winit-self)
+  if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
+    add_extra_compiler_option(-Wno-psabi)
+  endif()
   if(HAVE_CXX11)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT ENABLE_PRECOMPILED_HEADERS)
       add_extra_compiler_option(-Wsuggest-override)
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index a4d2c29d34..d3f78beb8e 100644
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -845,36 +845,24 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 /** Mask **/
 inline int v_signmask(const v_uint8x16& a)
 {
-    vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
-    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
-    sv = vec_sl(sv, slm);
-    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
-    static const vec_uint4 slm4 = {0, 0, 8, 8};
-    sv4 = vec_sl(sv4, slm4);
-    return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
+    static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_int8x16& a)
 { return v_signmask(v_reinterpret_as_u8(a)); }

 inline int v_signmask(const v_int16x8& a)
 {
-    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
-    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
-    sv = vec_sl(sv, slm);
-    vec_int4 svi = vec_int4_z;
-    svi = vec_sums(vec_sum4s(sv, svi), svi);
-    return vec_extract(svi, 3);
+    static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint16x8& a)
 { return v_signmask(v_reinterpret_as_s16(a)); }

 inline int v_signmask(const v_int32x4& a)
 {
-    static const vec_uint4 slm = {0, 1, 2, 3};
-    vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
-    sv = vec_sl(sv, slm);
-    sv = vec_sums(sv, vec_int4_z);
-    return vec_extract(sv, 3);
+    static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint32x4& a)
 { return v_signmask(v_reinterpret_as_s32(a)); }
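The vbpermq-based v_signmask above packs the most-significant bit of every lane into a small integer in two instructions: vec_vbpermq treats each qperm entry as a big-endian bit index into the 128-bit source (so 120 selects the sign bit of byte 15, and entries of 128 and above yield zero), which is why the u8 variant lists bit positions 120..0 and the wider variants pad with 128. As an illustration only (the helper name below is ours, not OpenCV's), a scalar sketch of the contract the intrinsic version implements:

    #include <cstdint>

    // Scalar equivalent of v_signmask(v_uint8x16): bit i of the result is
    // the MSB of byte i. The VSX version gathers all 16 bits with a single
    // vec_vbpermq plus one vec_extract.
    static inline int signmask_u8x16_ref(const uint8_t v[16])
    {
        int mask = 0;
        for (int i = 0; i < 16; ++i)
            mask |= ((v[i] >> 7) & 1) << i;
        return mask;
    }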
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index d78a443c2c..a459a06c5c 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -554,7 +554,9 @@ struct HWFeatures
         have[CV_CPU_FP16] = true;
     #endif
     #endif
-
+    #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
+        have[CV_CPU_NEON] = true;
+    #endif
         // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
         have[CV_CPU_VSX] = (CV_VSX);
         // TODO: Check VSX3 availability in runtime for other platforms
diff --git a/modules/core/test/test_ptr.cpp b/modules/core/test/test_ptr.cpp
index 885516d1b6..002bfa6c01 100644
--- a/modules/core/test/test_ptr.cpp
+++ b/modules/core/test/test_ptr.cpp
@@ -160,14 +160,7 @@ TEST(Core_Ptr, assignment)
 {
     Ptr<Reporter> p1(new Reporter(&deleted1));
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-CV_DO_PRAGMA(GCC diagnostic push)
-CV_DO_PRAGMA(GCC diagnostic ignored "-Wself-assign-overloaded")
-#endif
-    p1 = p1;
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-CV_DO_PRAGMA(GCC diagnostic pop)
-#endif
+    p1 = *&p1;
     EXPECT_FALSE(deleted1);
 }
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index f1c2eb71d6..fa6eadfb8d 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -37,7 +37,9 @@ else()
       -Wunused-parameter -Wsign-compare
   )
 endif()
-
+if(HAVE_CUDA)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
 if(NOT HAVE_CXX11)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-undef)  # LANG_CXX11 from protobuf files
 endif()
diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index fa4b2f9349..c1905cb9bf 100644
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -123,9 +123,12 @@ PERF_TEST_P_(DNNTestNetwork, SSD)
 PERF_TEST_P_(DNNTestNetwork, OpenFace)
 {
-    if (backend == DNN_BACKEND_HALIDE ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
+    if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
     processNet("dnn/openface_nn4.small2.v1.t7", "", "", Mat(cv::Size(96, 96), CV_32FC3));
 }
@@ -185,16 +188,6 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
 {
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-        throw SkipTestException("Test is disabled for MyriadX");
-#endif
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("Test is disabled for Myriad in OpenVINO 2019R2");
-#endif
-
     processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "", Mat(cv::Size(300, 300), CV_32FC3));
 }
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 3c0b8cff26..f2a3a7bf08 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -719,21 +719,23 @@ struct DataLayer : public Layer
         CV_Assert(numChannels <= 4);

         // Scale
-        auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                {numChannels});
+        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
+                                       InferenceEngine::Layout::C);
+        auto weights = InferenceEngine::make_shared_blob<float>(td);
         weights->allocate();
-        weights->set(std::vector<float>(numChannels, scaleFactors[0]));
+
+        float* weight_buf = weights->buffer().as<float*>();
+        std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);

         // Mean subtraction
-        auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                               {numChannels});
+        auto biases = InferenceEngine::make_shared_blob<float>(td);
         biases->allocate();
-        std::vector<float> biasesVec(numChannels);
+        float* bias_buf = biases->buffer().as<float*>();
+
         for (int i = 0; i < numChannels; ++i)
         {
-            biasesVec[i] = -means[0][i] * scaleFactors[0];
+            bias_buf[i] = -means[0][i] * scaleFactors[0];
         }
-        biases->set(biasesVec);

         InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
         addConstantData("weights", weights, ieLayer);
@@ -1536,7 +1538,11 @@ struct Net::Impl
             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
+#else
+                dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
+#endif
             }
         }
         else
@@ -1544,7 +1550,11 @@ struct Net::Impl
             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = ld.name;
+#else
+                dataPtr->setName(ld.name);
+#endif
             }
         }
     }
@@ -1565,7 +1575,11 @@ struct Net::Impl
             for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = netInputLayer->outNames[i];
+#else
+                dataPtr->setName(netInputLayer->outNames[i]);
+#endif
             }
         }
         else
@@ -1573,7 +1587,11 @@ struct Net::Impl
             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
             {
                 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                 dataPtr->name = ld.name;
+#else
+                dataPtr->setName(ld.name);
+#endif
             }
         }
         ieNode->net->addBlobs(ld.inputBlobsWrappers);
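The dnn.cpp changes above migrate off two Inference Engine APIs deprecated around 2019R1: the (Precision, dims) overload of make_shared_blob and Blob::set() give way to TensorDesc-based construction plus direct writes through buffer(), and Data::name becomes Data::setName() behind a version guard. A condensed sketch of the new fill pattern, mirroring the diff (the helper is illustrative, not OpenCV code):

    #include <inference_engine.hpp>
    #include <algorithm>

    InferenceEngine::Blob::Ptr makeConstBlob(size_t numChannels, float value)
    {
        // Describe precision, shape and layout up front...
        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32,
                                       {numChannels}, InferenceEngine::Layout::C);
        auto blob = InferenceEngine::make_shared_blob<float>(td);
        blob->allocate();                         // ...allocate...
        float* buf = blob->buffer().as<float*>();
        std::fill(buf, buf + numChannels, value); // ...and write in place (replaces Blob::set()).
        return blob;
    }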
diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp
index 88654623ac..ef44ed79c4 100644
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@@ -111,7 +111,8 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        CV_Assert(!input->dims.empty());
+        std::vector<size_t> dims = input->getDims();
+        CV_Assert(!dims.empty());

         InferenceEngine::Builder::Layer ieLayer(name);
         ieLayer.setName(name);
@@ -122,12 +123,10 @@ public:
         else
         {
             ieLayer.setType("Split");
-            ieLayer.getParameters()["axis"] = input->dims.size() - 1;
-            ieLayer.getParameters()["out_sizes"] = input->dims[0];
+            ieLayer.getParameters()["axis"] = dims.size() - 1;
+            ieLayer.getParameters()["out_sizes"] = dims[0];
         }
-        std::vector<size_t> shape(input->dims);
-        std::reverse(shape.begin(), shape.end());
-        ieLayer.setInputPorts({InferenceEngine::Port(shape)});
+        ieLayer.setInputPorts({InferenceEngine::Port(dims)});
         ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp
index 72baba71e6..aae9bdea1a 100644
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -316,7 +316,7 @@ public:
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

         InferenceEngine::Builder::ConcatLayer ieLayer(name);
-        ieLayer.setAxis(clamp(axis, input->dims.size()));
+        ieLayer.setAxis(clamp(axis, input->getDims().size()));
         ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 83e881381c..42a2597af6 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -541,15 +541,14 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        CV_Assert(input->dims.size() == 4 || input->dims.size() == 5);
-
-        const int inpCn = input->dims[input->dims.size() - 2];  // NOTE: input->dims are reversed (WHIO or WHDIO)
+        std::vector<size_t> dims = input->getDims();
+        CV_Assert(dims.size() == 4 || dims.size() == 5);
+        const int inpCn = dims[1];
         const int outCn = blobs[0].size[0];
         const int inpGroupCn = blobs[0].size[1];
         const int group = inpCn / inpGroupCn;
-
-        InferenceEngine::Layout layout = (input->dims.size() == 4) ? InferenceEngine::Layout::OIHW :
-                                                                     InferenceEngine::Layout::NCDHW;
+        InferenceEngine::Layout layout = (dims.size() == 4) ? InferenceEngine::Layout::OIHW :
+                                                              InferenceEngine::Layout::NCDHW;

         auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
         if (fusedWeights)
@@ -561,9 +560,10 @@ public:
         }
         else
         {
-            ieWeights = InferenceEngine::make_shared_blob<float>(
-                                InferenceEngine::Precision::FP32, layout,
-                                ieWeights->dims());
+            ieWeights = InferenceEngine::make_shared_blob<float>({
+                            InferenceEngine::Precision::FP32,
+                            ieWeights->getTensorDesc().getDims(), layout
+                        });
             ieWeights->allocate();

             Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, outCn);
@@ -1953,9 +1953,10 @@ public:
         auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
         if (fusedWeights)
         {
-            ieWeights = InferenceEngine::make_shared_blob<float>(
-                                InferenceEngine::Precision::FP32, layout,
-                                ieWeights->dims());
+            ieWeights = InferenceEngine::make_shared_blob<float>({
+                            InferenceEngine::Precision::FP32,
+                            ieWeights->getTensorDesc().getDims(), layout
+                        });
             ieWeights->allocate();

             int inpCn = blobs[0].size[0];
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp
index 09fac59078..b6b973d226 100644
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@@ -261,7 +261,8 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        if (input->dims.size() == 4)
+        std::vector<size_t> dims = input->getDims();
+        if (dims.size() == 4)
         {
             InferenceEngine::Builder::NormalizeLayer ieLayer(name);

@@ -270,13 +271,14 @@ public:
             ieLayer.setEpsilon(epsilon);

             InferenceEngine::Builder::Layer l = ieLayer;
-            const int numChannels = input->dims[2];  // NOTE: input->dims are reversed (whcn)
+            const int numChannels = dims[1];
             InferenceEngine::Blob::Ptr weights;
             if (blobs.empty())
             {
-                weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                   InferenceEngine::Layout::C,
-                                                                   {(size_t)numChannels});
+                weights = InferenceEngine::make_shared_blob<float>({
+                              InferenceEngine::Precision::FP32,
+                              {(size_t)numChannels}, InferenceEngine::Layout::C
+                          });
                 weights->allocate();

                 Mat weightsMat = infEngineBlobToMat(weights).reshape(1, numChannels);
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index 2ec4be17be..d8bdff96cc 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -167,9 +167,11 @@ public:
             if (kernel_size.size() == 3)
                 return preferableTarget == DNN_TARGET_CPU;
             if (preferableTarget == DNN_TARGET_MYRIAD) {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
                 if (type == MAX && (pad_l == 1 && pad_t == 1) && stride == Size(2, 2))
                 {
                     return !isMyriadX();
                 }
+#endif
                 return type == MAX || type == AVE;
             }
             else
diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index 5e22519c39..4486a0f6de 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -207,12 +207,13 @@ public:
         }
         else
         {
-            auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                    {numChannels});
+            auto weights = InferenceEngine::make_shared_blob<float>({
+                               InferenceEngine::Precision::FP32, {(size_t)numChannels},
+                               InferenceEngine::Layout::C
+                           });
             weights->allocate();
-
-            std::vector<float> ones(numChannels, 1);
-            weights->set(ones);
+            float* buf = weights->buffer().as<float*>();
+            std::fill(buf, buf + numChannels, 1);
             addConstantData("weights", weights, l);
         }
         if (hasBias)
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 7640d4637e..430555161b 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -301,14 +301,14 @@ public:
     {
         std::vector<size_t> outShape(numDims);
         for (int i = 0; i < numDims; ++i)
-            outShape[numDims - 1 - i] = sliceRanges[0][i].size();
+            outShape[i] = sliceRanges[0][i].size();

         ieLayer.getInputPorts()[1].setParameter("type", "weights");

-        // Fake blob which will be moved to inputs (as weights).
-        auto shapeSource = InferenceEngine::make_shared_blob<float>(
-                               InferenceEngine::Precision::FP32,
-                               InferenceEngine::Layout::ANY, outShape);
+        auto shapeSource = InferenceEngine::make_shared_blob<float>({
+                               InferenceEngine::Precision::FP32, outShape,
+                               InferenceEngine::Layout::ANY
+                           });
         shapeSource->allocate();
         addConstantData("weights", shapeSource, ieLayer);
     }
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index 127abb20d0..59c8163492 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -329,7 +329,8 @@ public:
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);

         InferenceEngine::Builder::SoftMaxLayer ieLayer(name);
-        ieLayer.setAxis(clamp(axisRaw, input->dims.size()));
+        ieLayer.setAxis(clamp(axisRaw, input->getDims().size()));
+
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
 #endif  // HAVE_INF_ENGINE
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp
index 2635c4dc73..71e9a7b8aa 100644
--- a/modules/dnn/src/op_inf_engine.cpp
+++ b/modules/dnn/src/op_inf_engine.cpp
@@ -45,13 +45,13 @@ infEngineWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
 InfEngineBackendNet::InfEngineBackendNet() : netBuilder("")
 {
     hasNetOwner = false;
-    targetDevice = InferenceEngine::TargetDevice::eCPU;
+    device_name = "CPU";
 }

 InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) : netBuilder(""), cnn(net)
 {
     hasNetOwner = true;
-    targetDevice = InferenceEngine::TargetDevice::eCPU;
+    device_name = "CPU";
 }

 void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
@@ -66,16 +66,13 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
     for (size_t i = 0; i < inpWrappers.size(); ++i)
     {
         const auto& inp = inpWrappers[i];
-        const std::string& inpName = inp->dataPtr->name;
+        const std::string& inpName = inp->dataPtr->getName();
         int inpId;
         it = layers.find(inpName);
         if (it == layers.end())
         {
             InferenceEngine::Builder::InputLayer inpLayer(!inpName.empty() ? inpName : kDefaultInpLayerName);
-
-            std::vector<size_t> shape(inp->blob->dims());
-            std::reverse(shape.begin(), shape.end());
-
+            std::vector<size_t> shape(inp->blob->getTensorDesc().getDims());
             inpLayer.setPort(InferenceEngine::Port(shape));
             inpId = netBuilder.addLayer(inpLayer);
@@ -89,7 +86,11 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,
     }
     CV_Assert(!outputs.empty());
     InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     dataPtr->name = layerName;
+#else
+    dataPtr->setName(layerName);
+#endif
 }

 void InfEngineBackendNet::init(int targetId)
@@ -115,21 +116,22 @@ void InfEngineBackendNet::init(int targetId)
     switch (targetId)
     {
-    case DNN_TARGET_CPU:
-        targetDevice = InferenceEngine::TargetDevice::eCPU;
-        break;
-    case DNN_TARGET_OPENCL: case DNN_TARGET_OPENCL_FP16:
-        targetDevice = InferenceEngine::TargetDevice::eGPU;
-        break;
-    case DNN_TARGET_MYRIAD:
-        targetDevice = InferenceEngine::TargetDevice::eMYRIAD;
-        break;
-    case DNN_TARGET_FPGA:
-        targetDevice = InferenceEngine::TargetDevice::eFPGA;
-        break;
-    default:
-        CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
-    }
+        case DNN_TARGET_CPU:
+            device_name = "CPU";
+            break;
+        case DNN_TARGET_OPENCL:
+        case DNN_TARGET_OPENCL_FP16:
+            device_name = "GPU";
+            break;
+        case DNN_TARGET_MYRIAD:
+            device_name = "MYRIAD";
+            break;
+        case DNN_TARGET_FPGA:
+            device_name = "FPGA";
+            break;
+        default:
+            CV_Error(Error::StsNotImplemented, "Unknown target");
+    };

     for (const auto& name : requestedOutputs)
     {
@@ -141,14 +143,14 @@ void InfEngineBackendNet::init(int targetId)
         const std::string& name = it.first;
         auto blobIt = allBlobs.find(name);
         CV_Assert(blobIt != allBlobs.end());
-        it.second->setPrecision(blobIt->second->precision());
+        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());
     }
     for (const auto& it : cnn.getOutputsInfo())
     {
         const std::string& name = it.first;
         auto blobIt = allBlobs.find(name);
         CV_Assert(blobIt != allBlobs.end());
-        it.second->setPrecision(blobIt->second->precision());  // Should be always FP32
+        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());  // Should be always FP32
     }

     initPlugin(cnn);
@@ -223,16 +225,13 @@ static InferenceEngine::Layout estimateLayout(const Mat& m)

 static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std::string& name = "")
 {
-    std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-    std::reverse(reversedShape.begin(), reversedShape.end());
+    std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
     if (m.type() == CV_32F)
-        return InferenceEngine::DataPtr(
-            new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m))
-        );
+        return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+               {InferenceEngine::Precision::FP32, shape, estimateLayout(m)}));
     else if (m.type() == CV_8U)
-        return InferenceEngine::DataPtr(
-            new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::U8, estimateLayout(m))
-        );
+        return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+               {InferenceEngine::Precision::U8, shape, estimateLayout(m)}));
     else
         CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }
@@ -241,33 +240,33 @@ InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<size_t>& shape,
                                                InferenceEngine::Layout layout)
 {
     if (m.type() == CV_32F)
-        return InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                        layout, shape, (float*)m.data);
+        return InferenceEngine::make_shared_blob<float>(
+            {InferenceEngine::Precision::FP32, shape, layout}, (float*)m.data);
     else if (m.type() == CV_8U)
-        return InferenceEngine::make_shared_blob<uint8_t>(InferenceEngine::Precision::U8,
-                                                          layout, shape, (uint8_t*)m.data);
+        return InferenceEngine::make_shared_blob<uint8_t>(
+            {InferenceEngine::Precision::U8, shape, layout}, (uint8_t*)m.data);
     else
         CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }

 InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout)
 {
-    std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-    std::reverse(reversedShape.begin(), reversedShape.end());
-    return wrapToInfEngineBlob(m, reversedShape, layout);
+    std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
+    return wrapToInfEngineBlob(m, shape, layout);
 }

 InferenceEngine::Blob::Ptr cloneBlob(const InferenceEngine::Blob::Ptr& blob)
 {
-    InferenceEngine::Precision precision = blob->precision();
     InferenceEngine::Blob::Ptr copy;
+    auto description = blob->getTensorDesc();
+    InferenceEngine::Precision precision = description.getPrecision();
     if (precision == InferenceEngine::Precision::FP32)
     {
-        copy = InferenceEngine::make_shared_blob<float>(precision, blob->layout(), blob->dims());
+        copy = InferenceEngine::make_shared_blob<float>(description);
     }
     else if (precision == InferenceEngine::Precision::U8)
     {
-        copy = InferenceEngine::make_shared_blob<uint8_t>(precision, blob->layout(), blob->dims());
+        copy = InferenceEngine::make_shared_blob<uint8_t>(description);
     }
     else
         CV_Error(Error::StsNotImplemented, "Unsupported blob precision");
@@ -296,10 +295,8 @@ InfEngineBackendWrapper::InfEngineBackendWrapper(Ptr<BackendWrapper> wrapper)
     Ptr<InfEngineBackendWrapper> ieWrapper = wrapper.dynamicCast<InfEngineBackendWrapper>();
     CV_Assert(!ieWrapper.empty());
     InferenceEngine::DataPtr srcData = ieWrapper->dataPtr;
-    dataPtr = InferenceEngine::DataPtr(
-        new InferenceEngine::Data(srcData->name, srcData->dims, srcData->precision,
-                                  srcData->layout)
-    );
+
+    dataPtr = InferenceEngine::DataPtr(new InferenceEngine::Data(srcData->getName(), srcData->getTensorDesc()));
     blob = ieWrapper->blob;
 }
@@ -324,12 +321,19 @@ void InfEngineBackendWrapper::setHostDirty()
 }

-static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
+static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
 {
-    static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
+    static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
     return sharedPlugins;
 }
-
+#else
+static InferenceEngine::Core& getCore()
+{
+    static InferenceEngine::Core core;
+    return core;
+}
+#endif

 #if !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)
 static bool detectMyriadX_()
@@ -362,24 +366,29 @@ static bool detectMyriadX_()
     InferenceEngine::CNNNetwork cnn = InferenceEngine::CNNNetwork(
         InferenceEngine::Builder::convertToICNNNetwork(builder.build()));

-    InferenceEngine::TargetDevice device = InferenceEngine::TargetDevice::eMYRIAD;
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     InferenceEngine::InferenceEnginePluginPtr enginePtr;
     {
         AutoLock lock(getInitializationMutex());
         auto& sharedPlugins = getSharedPlugins();
-        auto pluginIt = sharedPlugins.find(device);
+        auto pluginIt = sharedPlugins.find("MYRIAD");
         if (pluginIt != sharedPlugins.end()) {
             enginePtr = pluginIt->second;
         } else {
             auto dispatcher = InferenceEngine::PluginDispatcher({""});
-            enginePtr = dispatcher.getSuitablePlugin(device);
-            sharedPlugins[device] = enginePtr;
+            enginePtr = dispatcher.getPluginByDevice("MYRIAD");
+            sharedPlugins["MYRIAD"] = enginePtr;
         }
     }
     auto plugin = InferenceEngine::InferencePlugin(enginePtr);
     try
     {
         auto netExec = plugin.LoadNetwork(cnn, {{"VPU_PLATFORM", "VPU_2480"}});
+#else
+    try
+    {
+        auto netExec = getCore().LoadNetwork(cnn, "MYRIAD", {{"VPU_PLATFORM", "VPU_2480"}});
+#endif
         auto infRequest = netExec.CreateInferRequest();
     } catch(...) {
         return false;
@@ -388,38 +397,41 @@ static bool detectMyriadX_()
 }
 #endif  // !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)

-void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
+void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
 {
     CV_Assert(!isInitialized());

     try
     {
         AutoLock lock(getInitializationMutex());
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
         auto& sharedPlugins = getSharedPlugins();
-        auto pluginIt = sharedPlugins.find(targetDevice);
+        auto pluginIt = sharedPlugins.find(device_name);
         if (pluginIt != sharedPlugins.end())
         {
             enginePtr = pluginIt->second;
         }
         else
+#endif
         {
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
             auto dispatcher = InferenceEngine::PluginDispatcher({""});
-            if (targetDevice == InferenceEngine::TargetDevice::eFPGA)
+            if (device_name == "FPGA")
                 enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
             else
-                enginePtr = dispatcher.getSuitablePlugin(targetDevice);
-            sharedPlugins[targetDevice] = enginePtr;
-
+                enginePtr = dispatcher.getPluginByDevice(device_name);
+            sharedPlugins[device_name] = enginePtr;
+#else
+            isInit = true;
+#endif
             std::vector<std::string> candidates;
-
             std::string param_pluginPath = utils::getConfigurationParameterString("OPENCV_DNN_IE_EXTRA_PLUGIN_PATH", "");
             if (!param_pluginPath.empty())
            {
                 candidates.push_back(param_pluginPath);
             }
-            if (targetDevice == InferenceEngine::TargetDevice::eCPU ||
-                targetDevice == InferenceEngine::TargetDevice::eFPGA)
+            if (device_name == "CPU" || device_name == "FPGA")
             {
                 std::string suffixes[] = {"_avx2", "_sse4", ""};
                 bool haveFeature[] = {
@@ -449,7 +461,12 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
                 {
                     InferenceEngine::IExtensionPtr extension =
                         InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(libName);
+
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
                     enginePtr->AddExtension(extension, 0);
+#else
+                    getCore().AddExtension(extension, "CPU");
+#endif
                     CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << libName);
                     found = true;
                     break;
@@ -463,14 +480,24 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
             // Some of networks can work without a library of extra layers.
 #ifndef _WIN32
             // Limit the number of CPU threads.
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
             enginePtr->SetConfig({{
                 InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
             }}, 0);
+#else
+            if (device_name == "CPU")
+                getCore().SetConfig({{
+                    InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
+                }}, device_name);
+#endif
 #endif
         }
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
         plugin = InferenceEngine::InferencePlugin(enginePtr);
-        netExec = plugin.LoadNetwork(net, {});
+        netExec = plugin.LoadNetwork(net, {});
+#else
+        netExec = getCore().LoadNetwork(net, device_name);
+#endif
     }
     catch (const std::exception& ex)
     {
@@ -480,7 +507,11 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)

 bool InfEngineBackendNet::isInitialized()
 {
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     return (bool)enginePtr;
+#else
+    return isInit;
+#endif
 }

 void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs)
@@ -488,7 +519,7 @@ void InfEngineBackendNet::addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs)
     auto wrappers = infEngineWrappers(ptrs);
     for (const auto& wrapper : wrappers)
     {
-        std::string name = wrapper->dataPtr->name;
+        std::string name = wrapper->dataPtr->getName();
         name = name.empty() ? kDefaultInpLayerName : name;
         allBlobs.insert({name, wrapper->blob});
     }
@@ -503,7 +534,7 @@ void InfEngineBackendNet::InfEngineReqWrapper::makePromises(const std::vector<Ptr<BackendWrapper> >& outs)
     {
         outs[i]->futureMat = outProms[i].getArrayResult();
-        outsNames[i] = outs[i]->dataPtr->name;
+        outsNames[i] = outs[i]->dataPtr->getName();
     }
 }
@@ -627,11 +658,12 @@ void InfEngineBackendNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
 Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
 {
     // NOTE: Inference Engine sizes are reversed.
-    std::vector<size_t> dims = blob->dims();
-    std::vector<int> size(dims.rbegin(), dims.rend());
+    std::vector<size_t> dims = blob->getTensorDesc().getDims();
+    std::vector<int> size(dims.begin(), dims.end());
+    auto precision = blob->getTensorDesc().getPrecision();

     int type = -1;
-    switch (blob->precision())
+    switch (precision)
     {
         case InferenceEngine::Precision::FP32: type = CV_32F; break;
         case InferenceEngine::Precision::U8: type = CV_8U; break;
@@ -685,7 +717,10 @@ void InfEngineBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs,

 InferenceEngine::Blob::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob)
 {
-    auto halfs = InferenceEngine::make_shared_blob<int16_t>(InferenceEngine::Precision::FP16, blob->layout(), blob->dims());
+    auto halfs = InferenceEngine::make_shared_blob<int16_t>({
+                     InferenceEngine::Precision::FP16, blob->getTensorDesc().getDims(),
+                     blob->getTensorDesc().getLayout()
+                 });
     halfs->allocate();
     Mat floatsData(1, blob->size(), CV_32F, blob->buffer());
     Mat halfsData(1, blob->size(), CV_16SC1, halfs->buffer());
@@ -732,7 +767,11 @@ void resetMyriadDevice()
 {
 #ifdef HAVE_INF_ENGINE
     AutoLock lock(getInitializationMutex());
-    getSharedPlugins().erase(InferenceEngine::TargetDevice::eMYRIAD);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
+    getSharedPlugins().erase("MYRIAD");
+#else
+    getCore().UnregisterPlugin("MYRIAD");
+#endif
 #endif  // HAVE_INF_ENGINE
 }
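Everything in op_inf_engine.cpp now branches on INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1): older releases keep a per-device plugin cache filled through PluginDispatcher, while newer releases route every device through one shared InferenceEngine::Core addressed by a plain device string. Reduced to a sketch (it assumes the same version macros the diff uses and omits the extension loading and thread-limit configuration):

    #include <inference_engine.hpp>

    InferenceEngine::ExecutableNetwork loadFor(InferenceEngine::CNNNetwork& net,
                                               const std::string& device_name)
    {
    #if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
        // Pre-2019R2: locate a device-specific plugin and load through it.
        auto dispatcher = InferenceEngine::PluginDispatcher({""});
        auto enginePtr = dispatcher.getPluginByDevice(device_name);
        auto plugin = InferenceEngine::InferencePlugin(enginePtr);
        return plugin.LoadNetwork(net, {});
    #else
        // 2019R2+: a single process-wide Core serves every device.
        static InferenceEngine::Core core;
        return core.LoadNetwork(net, device_name);
    #endif
    }

Keeping the Core static mirrors getCore() in the diff; it also explains why resetMyriadDevice() switches from erasing a cached plugin to a core-level UnregisterPlugin("MYRIAD").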
diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp
index 6aa9a3b407..bfff1e2bf1 100644
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -92,18 +92,22 @@ public:
     void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                  bool isAsync);

-    void initPlugin(InferenceEngine::ICNNNetwork& net);
+    void initPlugin(InferenceEngine::CNNNetwork& net);

     void addBlobs(const std::vector<Ptr<BackendWrapper> >& ptrs);

 private:
     InferenceEngine::Builder::Network netBuilder;

-    InferenceEngine::InferenceEnginePluginPtr enginePtr;
-    InferenceEngine::InferencePlugin plugin;
     InferenceEngine::ExecutableNetwork netExec;
     InferenceEngine::BlobMap allBlobs;
-    InferenceEngine::TargetDevice targetDevice;
+    std::string device_name;
+#if INF_ENGINE_VER_MAJOR_LE(2019010000)
+    InferenceEngine::InferenceEnginePluginPtr enginePtr;
+    InferenceEngine::InferencePlugin plugin;
+#else
+    bool isInit = false;
+#endif

     struct InfEngineReqWrapper
     {
diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp
index f3cf6c9e3b..0076556854 100644
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@@ -136,13 +136,10 @@ static const std::vector<std::string> getOpenVINOTestModelsList()

 static inline void genData(const std::vector<size_t>& dims, Mat& m, Blob::Ptr& dataPtr)
 {
-    std::vector<int> reversedDims(dims.begin(), dims.end());
-    std::reverse(reversedDims.begin(), reversedDims.end());
-
-    m.create(reversedDims, CV_32F);
+    m.create(std::vector<int>(dims.begin(), dims.end()), CV_32F);
     randu(m, -1, 1);

-    dataPtr = make_shared_blob<float>(Precision::FP32, dims, (float*)m.data);
+    dataPtr = make_shared_blob<float>({Precision::FP32, dims, Layout::ANY}, (float*)m.data);
 }

 void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
@@ -154,32 +151,42 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,

     CNNNetwork net = reader.getNetwork();

+    std::string device_name;
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+    Core ie;
+#else
     InferenceEnginePluginPtr enginePtr;
     InferencePlugin plugin;
+#endif
     ExecutableNetwork netExec;
     InferRequest infRequest;
+
     try
     {
-        auto dispatcher = InferenceEngine::PluginDispatcher({""});
         switch (target)
         {
             case DNN_TARGET_CPU:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eCPU);
+                device_name = "CPU";
                 break;
             case DNN_TARGET_OPENCL:
             case DNN_TARGET_OPENCL_FP16:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eGPU);
+                device_name = "GPU";
                 break;
             case DNN_TARGET_MYRIAD:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eMYRIAD);
+                device_name = "MYRIAD";
                 break;
             case DNN_TARGET_FPGA:
-                enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
+                device_name = "FPGA";
                 break;
             default:
                 CV_Error(Error::StsNotImplemented, "Unknown target");
         };

+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+        auto dispatcher = InferenceEngine::PluginDispatcher({""});
+        enginePtr = dispatcher.getPluginByDevice(device_name);
+#endif
         if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA)
         {
             std::string suffixes[] = {"_avx2", "_sse4", ""};
@@ -202,16 +209,23 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
                 try
                 {
                     IExtensionPtr extension = make_so_pointer<IExtension>(libName);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+                    ie.AddExtension(extension, device_name);
+#else
                     enginePtr->AddExtension(extension, 0);
+#endif
                     break;
                 }
                 catch(...) {}
             }
             // Some of networks can work without a library of extra layers.
         }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+        netExec = ie.LoadNetwork(net, device_name);
+#else
         plugin = InferencePlugin(enginePtr);
-        netExec = plugin.LoadNetwork(net, {});
+        netExec = plugin.LoadNetwork(net, {});
+#endif
         infRequest = netExec.CreateInferRequest();
     }
     catch (const std::exception& ex)
@@ -224,7 +238,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
     BlobMap inputBlobs;
     for (auto& it : net.getInputsInfo())
     {
-        genData(it.second->getDims(), inputsMap[it.first], inputBlobs[it.first]);
+        genData(it.second->getTensorDesc().getDims(), inputsMap[it.first], inputBlobs[it.first]);
     }
     infRequest.SetInput(inputBlobs);

@@ -233,7 +247,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
     BlobMap outputBlobs;
     for (auto& it : net.getOutputsInfo())
     {
-        genData(it.second->dims, outputsMap[it.first], outputBlobs[it.first]);
+        genData(it.second->getTensorDesc().getDims(), outputsMap[it.first], outputBlobs[it.first]);
     }
     infRequest.SetOutput(outputBlobs);
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index 74e2c1cf40..6d45a89a05 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -469,6 +469,42 @@ INSTANTIATE_TEST_CASE_P(/**/, Async, Combine(
   Values(CV_32F, CV_8U),
   testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
 ));
+
+typedef testing::TestWithParam<Target> Test_Model_Optimizer;
+TEST_P(Test_Model_Optimizer, forward_two_nets)
+{
+    const int target = GetParam();
+
+    const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+    Net net0 = readNet(model, proto);
+    net0.setPreferableTarget(target);
+
+    Net net1 = readNet(model, proto);
+    net1.setPreferableTarget(target);
+
+    // Generate inputs.
+    int blobSize[] = {2, 6, 75, 113};
+    Mat input(4, &blobSize[0], CV_32F);
+    randu(input, 0, 255);
+
+    net0.setInput(input);
+    Mat ref0 = net0.forward().clone();
+
+    net1.setInput(input);
+    Mat ref1 = net1.forward();
+
+    net0.setInput(input);
+    Mat ref2 = net0.forward();
+
+    normAssert(ref0, ref2, 0, 0);
+}
+INSTANTIATE_TEST_CASE_P(/**/, Test_Model_Optimizer,
+  testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
+);
+
 #endif  // HAVE_INF_ENGINE

 }} // namespace
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 3dd0481f57..fa98e745f5 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -357,11 +357,9 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 #if defined(INF_ENGINE_RELEASE)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
     {
-#if INF_ENGINE_VER_MAJOR_EQ(2019010000)
+#if INF_ENGINE_VER_MAJOR_GE(2019020000)
         if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#else
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif
     }
 #endif
@@ -395,16 +393,10 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
 {
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-    {
-#if INF_ENGINE_VER_MAJOR_LE(2019010000)
-        if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#else
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
-#endif
-    }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+        getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
 #endif

     checkBackend();
@@ -456,12 +448,13 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
     float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
 #if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-       )
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+        getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+    {
         scoreDiff = 0.061;
         iouDiff = 0.12;
         detectionConfThresh = 0.36;
+    }
 #endif
     normAssertDetections(ref, out, "", detectionConfThresh, scoreDiff, iouDiff);
     expectNoFallbacksFromIE(net);
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index ddc7f18acb..de81093d8d 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -262,7 +262,7 @@ class Test_Torch_nets : public DNNTestLayer {};

 TEST_P(Test_Torch_nets, OpenFace_accuracy)
 {
-#if defined(INF_ENGINE_RELEASE)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif
@@ -287,8 +287,8 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
     // Reference output values are in range [-0.17212, 0.263492]
     // on Myriad problem layer: l4_Pooling - does not use pads_begin
-    float l1 = (target == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
-    float lInf = (target == DNN_TARGET_OPENCL_FP16) ? 1.5e-3 : 1e-3;
+    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2e-3 : 1e-5;
+    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : 1e-3;
     Mat outRef = readTorchBlob(_tf("net_openface_output.dat"), true);
     normAssert(out, outRef, "", l1, lInf);
 }
diff --git a/modules/js/src/embindgen.py b/modules/js/src/embindgen.py
index 5b5cd98b9f..5f7599668d 100644
--- a/modules/js/src/embindgen.py
+++ b/modules/js/src/embindgen.py
@@ -98,7 +98,7 @@ core = {'': ['absdiff', 'add', 'addWeighted', 'bitwise_and', 'bitwise_not', 'bit
             'compare', 'convertScaleAbs', 'copyMakeBorder', 'countNonZero', 'determinant', 'dft', 'divide', 'eigen', \
             'exp', 'flip', 'getOptimalDFTSize','gemm', 'hconcat', 'inRange', 'invert', 'kmeans', 'log', 'magnitude', \
             'max', 'mean', 'meanStdDev', 'merge', 'min', 'minMaxLoc', 'mixChannels', 'multiply', 'norm', 'normalize', \
-            'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'setIdentity', 'setRNGSeed', \
+            'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed', \
             'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat'],
        'Algorithm': []}
diff --git a/modules/js/test/test_imgproc.js b/modules/js/test/test_imgproc.js
index 1f6c4c227d..673dfac549 100644
--- a/modules/js/test/test_imgproc.js
+++ b/modules/js/test/test_imgproc.js
@@ -941,4 +941,22 @@ QUnit.test('test_filter', function(assert) {
         inv3.delete();
         inv4.delete();
     }
+    //Rotate
+    {
+        let dst = new cv.Mat();
+        let src = cv.matFromArray(3, 2, cv.CV_8U, [1,2,3,4,5,6]);
+
+        cv.rotate(src, dst, cv.ROTATE_90_CLOCKWISE);
+
+        size = dst.size();
+        assert.equal(size.height, 2, "ROTATE_HEIGHT");
+        assert.equal(size.width, 3, "ROTATE_WIDTH");
+
+        let expected = new Uint8Array([5,3,1,6,4,2]);
+
+        assert.deepEqual(dst.data, expected);
+
+        dst.delete();
+        src.delete();
+    }
 });
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index e3e43bb86e..83eaba3d32 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -43,6 +43,7 @@
 #include "precomp.hpp"
 #include "cascadedetect.hpp"
 #include "opencv2/core/core_c.h"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencl_kernels_objdetect.hpp"

 #include <cstdio>
@@ -223,17 +224,6 @@ void HOGDescriptor::copyTo(HOGDescriptor& c) const
     c.signedGradient = signedGradient;
 }

-#if CV_NEON
-// replace of _mm_set_ps
-inline float32x4_t vsetq_f32(float f0, float f1, float f2, float f3)
-{
-    float32x4_t a = vdupq_n_f32(f0);
-    a = vsetq_lane_f32(f1, a, 1);
-    a = vsetq_lane_f32(f2, a, 2);
-    a = vsetq_lane_f32(f3, a, 3);
-    return a;
-}
-#endif
 void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
                                     Size paddingTL, Size paddingBR) const
 {
@@ -259,38 +249,22 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
     Mat_<float> _lut(1, 256);
     const float* const lut = &_lut(0,0);
-#if CV_SSE2
-    const int indices[] = { 0, 1, 2, 3 };
-    __m128i idx = _mm_loadu_si128((const __m128i*)indices);
-    __m128i ifour = _mm_set1_epi32(4);
+#if CV_SIMD128
+    v_float32x4 idx(0.0f, 1.0f, 2.0f, 3.0f);
+    v_float32x4 ifour = v_setall_f32(4.0);

     float* const _data = &_lut(0, 0);
-    if( gammaCorrection )
-        for( i = 0; i < 256; i += 4 )
+    if ( gammaCorrection )
+        for ( i = 0; i < 256; i += 4)
         {
-            _mm_storeu_ps(_data + i, _mm_sqrt_ps(_mm_cvtepi32_ps(idx)));
-            idx = _mm_add_epi32(idx, ifour);
+            v_store(_data + i, v_sqrt(idx));
+            idx += ifour;
         }
     else
-        for( i = 0; i < 256; i += 4 )
-        {
-            _mm_storeu_ps(_data + i, _mm_cvtepi32_ps(idx));
-            idx = _mm_add_epi32(idx, ifour);
-        }
-#elif CV_NEON
-    const int indices[] = { 0, 1, 2, 3 };
-    uint32x4_t idx = *(uint32x4_t*)indices;
-    uint32x4_t ifour = vdupq_n_u32(4);
-
-    float* const _data = &_lut(0, 0);
-    if( gammaCorrection )
-        for( i = 0; i < 256; i++ )
-            _lut(0,i) = std::sqrt((float)i);
-    else
-        for( i = 0; i < 256; i += 4 )
+        for ( i = 0; i < 256; i += 4)
         {
-            vst1q_f32(_data + i, vcvtq_f32_u32(idx));
-            idx = vaddq_u32 (idx, ifour);
+            v_store(_data + i, idx);
+            idx += ifour;
         }
 #else
     if( gammaCorrection )
@@ -327,17 +301,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
     {
         int end = gradsize.width + 2;
         xmap -= 1, x = 0;
-#if CV_SSE2
+#if CV_SIMD128
         for ( ; x <= end - 4; x += 4)
         {
-            __m128i mul_res = _mm_loadu_si128((const __m128i*)(xmap + x));
-            mul_res = _mm_add_epi32(_mm_add_epi32(mul_res, mul_res), mul_res);  // multiply by 3
-            _mm_storeu_si128((__m128i*)(xmap + x), mul_res);
+            v_int32x4 mul_res = v_load(xmap + x);
+            mul_res += mul_res + mul_res;
+            v_store(xmap + x, mul_res);
         }
-#elif CV_NEON
-        int32x4_t ithree = vdupq_n_s32(3);
-        for ( ; x <= end - 4; x += 4)
-            vst1q_s32(xmap + x, vmulq_s32(ithree, vld1q_s32(xmap + x)));
 #endif
         for ( ; x < end; ++x)
             xmap[x] *= 3;
@@ -368,46 +338,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
         else
         {
             x = 0;
-#if CV_SSE2
-            for( ; x <= width - 4; x += 4 )
-            {
-                int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
-                typedef const uchar* const T;
-                T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
-                T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
-                T p22 = imgPtr + xmap[x+3], p20 = p02;
-                T p32 = imgPtr + xmap[x+4], p30 = p12;
-
-                __m128 _dx0 = _mm_sub_ps(_mm_set_ps(lut[p32[0]], lut[p22[0]], lut[p12[0]], lut[p02[0]]),
-                                         _mm_set_ps(lut[p30[0]], lut[p20[0]], lut[p10[0]], lut[p00[0]]));
-                __m128 _dx1 = _mm_sub_ps(_mm_set_ps(lut[p32[1]], lut[p22[1]], lut[p12[1]], lut[p02[1]]),
-                                         _mm_set_ps(lut[p30[1]], lut[p20[1]], lut[p10[1]], lut[p00[1]]));
-                __m128 _dx2 = _mm_sub_ps(_mm_set_ps(lut[p32[2]], lut[p22[2]], lut[p12[2]], lut[p02[2]]),
-                                         _mm_set_ps(lut[p30[2]], lut[p20[2]], lut[p10[2]], lut[p00[2]]));
-
-                __m128 _dy0 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3]], lut[nextPtr[x2]], lut[nextPtr[x1]], lut[nextPtr[x0]]),
-                                         _mm_set_ps(lut[prevPtr[x3]], lut[prevPtr[x2]], lut[prevPtr[x1]], lut[prevPtr[x0]]));
-                __m128 _dy1 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+1]], lut[nextPtr[x2+1]], lut[nextPtr[x1+1]], lut[nextPtr[x0+1]]),
-                                         _mm_set_ps(lut[prevPtr[x3+1]], lut[prevPtr[x2+1]], lut[prevPtr[x1+1]], lut[prevPtr[x0+1]]));
-                __m128 _dy2 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+2]], lut[nextPtr[x2+2]], lut[nextPtr[x1+2]], lut[nextPtr[x0+2]]),
-                                         _mm_set_ps(lut[prevPtr[x3+2]], lut[prevPtr[x2+2]], lut[prevPtr[x1+2]], lut[prevPtr[x0+2]]));
-
-                __m128 _mag0 = _mm_add_ps(_mm_mul_ps(_dx0, _dx0), _mm_mul_ps(_dy0, _dy0));
-                __m128 _mag1 = _mm_add_ps(_mm_mul_ps(_dx1, _dx1), _mm_mul_ps(_dy1, _dy1));
-                __m128 _mag2 = _mm_add_ps(_mm_mul_ps(_dx2, _dx2), _mm_mul_ps(_dy2, _dy2));
-
-                __m128 mask = _mm_cmpgt_ps(_mag2, _mag1);
-                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx1));
-                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy1));
-
-                mask = _mm_cmpgt_ps(_mm_max_ps(_mag2, _mag1), _mag0);
-                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx0));
-                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy0));
-
-                _mm_storeu_ps(dbuf + x, _dx2);
-                _mm_storeu_ps(dbuf + x + width, _dy2);
-            }
-#elif CV_NEON
+#if CV_SIMD128
             for( ; x <= width - 4; x += 4 )
             {
                 int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
@@ -417,34 +348,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
                 T p22 = imgPtr + xmap[x+3], p20 = p02;
                 T p32 = imgPtr + xmap[x+4], p30 = p12;

-                float32x4_t _dx0 = vsubq_f32(vsetq_f32(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]),
-                                             vsetq_f32(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]));
-                float32x4_t _dx1 = vsubq_f32(vsetq_f32(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]),
-                                             vsetq_f32(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]));
-                float32x4_t _dx2 = vsubq_f32(vsetq_f32(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]),
-                                             vsetq_f32(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]));
-
-                float32x4_t _dy0 = vsubq_f32(vsetq_f32(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]),
-                                             vsetq_f32(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]));
-                float32x4_t _dy1 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]),
-                                             vsetq_f32(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]));
-                float32x4_t _dy2 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]),
-                                             vsetq_f32(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]));
-
-                float32x4_t _mag0 = vaddq_f32(vmulq_f32(_dx0, _dx0), vmulq_f32(_dy0, _dy0));
-                float32x4_t _mag1 = vaddq_f32(vmulq_f32(_dx1, _dx1), vmulq_f32(_dy1, _dy1));
-                float32x4_t _mag2 = vaddq_f32(vmulq_f32(_dx2, _dx2), vmulq_f32(_dy2, _dy2));
-
-                uint32x4_t mask = vcgtq_f32(_mag2, _mag1);
-                _dx2 = vbslq_f32(mask, _dx2, _dx1);
-                _dy2 = vbslq_f32(mask, _dy2, _dy1);
-
-                mask = vcgtq_f32(vmaxq_f32(_mag2, _mag1), _mag0);
-                _dx2 = vbslq_f32(mask, _dx2, _dx0);
-                _dy2 = vbslq_f32(mask, _dy2, _dy0);
-
-                vst1q_f32(dbuf + x, _dx2);
-                vst1q_f32(dbuf + x + width, _dy2);
+                v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) -
+                                   v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]);
+                v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) -
+                                   v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]);
+                v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) -
+                                   v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]);
+
+                v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) -
+                                   v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]);
+                v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) -
+                                   v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
+                v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) -
+                                   v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
+
+                v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
+                v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
+                v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2);
+
+                v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1);
+                _dx2 = v_select(mask, _dx2, _dx1);
+                _dy2 = v_select(mask, _dy2, _dy1);
+
+                mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0);
+                _dx2 = v_select(mask, _dx2, _dx0);
+                _dy2 = v_select(mask, _dy2, _dy0);
+
+                v_store(dbuf + x, _dx2);
+                v_store(dbuf + x + width, _dy2);
             }
 #endif
             for( ; x < width; x++ )
@@ -488,44 +419,40 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,

     // filling the result matrix
     x = 0;
-#if CV_SSE2
-    __m128 fhalf = _mm_set1_ps(0.5f), fzero = _mm_setzero_ps();
-    __m128 _angleScale = _mm_set1_ps(angleScale), fone = _mm_set1_ps(1.0f);
-    __m128i ione = _mm_set1_epi32(1), _nbins = _mm_set1_epi32(nbins), izero = _mm_setzero_si128();
+#if CV_SIMD128
+    v_float32x4 fhalf = v_setall_f32(0.5f);
+    v_float32x4 _angleScale = v_setall_f32(angleScale), fone = v_setall_f32(1.0f);
+    v_int32x4 ione = v_setall_s32(1), _nbins = v_setall_s32(nbins), izero = v_setzero_s32();

     for ( ; x <= width - 4; x += 4)
     {
         int x2 = x << 1;
-        __m128 _mag = _mm_loadu_ps(dbuf + x + (width << 1));
-        __m128 _angle = _mm_loadu_ps(dbuf + x + width * 3);
-        _angle = _mm_sub_ps(_mm_mul_ps(_angleScale, _angle), fhalf);
-
-        __m128 sign = _mm_and_ps(fone, _mm_cmplt_ps(_angle, fzero));
-        __m128i _hidx = _mm_cvttps_epi32(_angle);
-        _hidx = _mm_sub_epi32(_hidx, _mm_cvtps_epi32(sign));
-        _angle = _mm_sub_ps(_angle, _mm_cvtepi32_ps(_hidx));
-
-        __m128 ft0 = _mm_mul_ps(_mag, _mm_sub_ps(fone, _angle));
-        __m128 ft1 = _mm_mul_ps(_mag, _angle);
-        __m128 ft2 = _mm_unpacklo_ps(ft0, ft1);
-        __m128 ft3 = _mm_unpackhi_ps(ft0, ft1);
-
-        _mm_storeu_ps(gradPtr + x2, ft2);
-        _mm_storeu_ps(gradPtr + x2 + 4, ft3);
-
-        __m128i mask0 = _mm_sub_epi32(izero, _mm_srli_epi32(_hidx, 31));
-        __m128i it0 = _mm_and_si128(mask0, _nbins);
-        mask0 = _mm_cmplt_epi32(_hidx, _nbins);
-        __m128i it1 = _mm_andnot_si128(mask0, _nbins);
-        _hidx = _mm_add_epi32(_hidx, _mm_sub_epi32(it0, it1));
-
-        it0 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
-        _hidx = _mm_add_epi32(ione, _hidx);
-        _hidx = _mm_and_si128(_hidx, _mm_cmplt_epi32(_hidx, _nbins));
-        it1 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
-        it0 = _mm_unpacklo_epi8(it0, it1);
-
-        _mm_storel_epi64((__m128i*)(qanglePtr + x2), it0);
+        v_float32x4 _mag = v_load(dbuf + x + (width << 1));
+        v_float32x4 _angle = v_load(dbuf + x + width * 3);
+        _angle = (_angleScale * _angle) - fhalf;
+
+        v_int32x4 _hidx = v_floor(_angle);
+        _angle -= v_cvt_f32(_hidx);
+
+        v_float32x4 ft0 = _mag * (fone - _angle);
+        v_float32x4 ft1 = _mag * _angle;
+
+        v_store_interleave(gradPtr + x2, ft0, ft1);
+
+        v_int32x4 mask0 = _hidx >> 31;
+        v_int32x4 it0 = mask0 & _nbins;
+        mask0 = (_hidx >= _nbins);
+        v_int32x4 it1 = mask0 & _nbins;
+        _hidx += (it0 - it1);
+
+        it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
+        _hidx += ione;
+        _hidx &= (_hidx < _nbins);
+        it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
+        v_uint8x16 it2, it3;
+        v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3);
+
+        v_store_low(qanglePtr + x2, it2);
     }
 #endif
     for( ; x < width; x++ )
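The remaining hog.cpp hunks all follow the same recipe: delete the parallel SSE2 and NEON blocks and keep a single CV_SIMD128 universal-intrinsics path that compiles down to either instruction set (and to VSX). The per-lane strongest-channel selection is the one non-obvious idiom; a self-contained sketch of it, for illustration only (the helper is ours, not part of the patch):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    #if CV_SIMD128
    // Keep (dx2, dy2) in lanes where mag2 > mag1, otherwise take (dx1, dy1).
    // Comparisons yield all-ones lane masks, so v_select is branchless and
    // maps to blendvps / BSL / vsel depending on the target ISA.
    static inline void keepStronger(v_float32x4& dx2, v_float32x4& dy2,
                                    const v_float32x4& mag2, const v_float32x4& mag1,
                                    const v_float32x4& dx1, const v_float32x4& dy1)
    {
        v_float32x4 mask = v_reinterpret_as_f32(mag2 > mag1);
        dx2 = v_select(mask, dx2, dx1);
        dy2 = v_select(mask, dy2, dy1);
    }
    #endif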
float* hist = blockHist + pk.histOfs[0]; float t0 = hist[h0] + hist0[0]; @@ -939,31 +844,6 @@ const float* HOGCache::getBlock(Point pt, float* buf) t1 = hist[h1] + hist1[1]; hist[h0] = t0; hist[h1] = t1; } -#elif CV_NEON - float hist0[4], hist1[4]; - for( ; k < C2; k++ ) - { - const PixData& pk = _pixData[k]; - const float* const a = gradPtr + pk.gradOfs; - const uchar* const h = qanglePtr + pk.qangleOfs; - int h0 = h[0], h1 = h[1]; - - float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]); - float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights)); - - float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0], (blockHist + pk.histOfs[1])[h0], 0, 0); - float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1], (blockHist + pk.histOfs[1])[h1], 0, 0); - - float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w); - vst1q_f32(hist0, _t0); - vst1q_f32(hist1, _t1); - - (blockHist + pk.histOfs[0])[h0] = hist0[0]; - (blockHist + pk.histOfs[1])[h0] = hist0[1]; - - (blockHist + pk.histOfs[0])[h1] = hist1[0]; - (blockHist + pk.histOfs[1])[h1] = hist1[1]; - } #else for( ; k < C2; k++ ) { @@ -987,7 +867,7 @@ const float* HOGCache::getBlock(Point pt, float* buf) } #endif -#if CV_SSE2 +#if CV_SIMD128 for( ; k < C4; k++ ) { const PixData& pk = _pixData[k]; @@ -995,12 +875,12 @@ const float* HOGCache::getBlock(Point pt, float* buf) const uchar* const h = qanglePtr + pk.qangleOfs; int h0 = h[0], h1 = h[1]; - __m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]); - __m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights)); - __m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w); + v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]); + v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights); + v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w; - _mm_storeu_ps(hist0, _t0); - _mm_storeu_ps(hist1, _t1); + v_store(hist0, _t0); + v_store(hist1, _t1); float* hist = blockHist + pk.histOfs[0]; float t0 = hist[h0] + hist0[0]; @@ -1021,62 +901,6 @@ const float* HOGCache::getBlock(Point pt, float* buf) t0 = hist[h0] + hist0[3]; t1 = hist[h1] + hist1[3]; hist[h0] = t0; hist[h1] = t1; - -// __m128 _hist0 = _mm_set_ps((blockHist + pk.histOfs[3])[h0], (blockHist + pk.histOfs[2])[h0], -// (blockHist + pk.histOfs[1])[h0], (blockHist + pk.histOfs[0])[h0]); -// __m128 _hist1 = _mm_set_ps((blockHist + pk.histOfs[3])[h1], (blockHist + pk.histOfs[2])[h1], -// (blockHist + pk.histOfs[1])[h1], (blockHist + pk.histOfs[0])[h1]); -// -// _hist0 = _mm_add_ps(_t0, _hist0); -// _hist1 = _mm_add_ps(_t1, _hist1); -// -// _mm_storeu_ps(hist0, _hist0); -// _mm_storeu_ps(hist1, _hist1); -// -// (pk.histOfs[0] + blockHist)[h0] = hist0[0]; -// (pk.histOfs[1] + blockHist)[h0] = hist0[1]; -// (pk.histOfs[2] + blockHist)[h0] = hist0[2]; -// (pk.histOfs[3] + blockHist)[h0] = hist0[3]; -// -// (pk.histOfs[0] + blockHist)[h1] = hist1[0]; -// (pk.histOfs[1] + blockHist)[h1] = hist1[1]; -// (pk.histOfs[2] + blockHist)[h1] = hist1[2]; -// (pk.histOfs[3] + blockHist)[h1] = hist1[3]; - } -#elif CV_NEON - for( ; k < C4; k++ ) - { - const PixData& pk = _pixData[k]; - const float* const a = gradPtr + pk.gradOfs; - const uchar* const h = qanglePtr + pk.qangleOfs; - int h0 = h[0], h1 = h[1]; - - float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]); - float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights)); - - float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0], - (blockHist + pk.histOfs[1])[h0], - (blockHist + 
-                (blockHist + pk.histOfs[2])[h0],
-                (blockHist + pk.histOfs[3])[h0]);
-            float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1],
-                (blockHist + pk.histOfs[1])[h1],
-                (blockHist + pk.histOfs[2])[h1],
-                (blockHist + pk.histOfs[3])[h1]);
-
-
-            float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
-            vst1q_f32(hist0, _t0);
-            vst1q_f32(hist1, _t1);
-
-            (blockHist + pk.histOfs[0])[h0] = hist0[0];
-            (blockHist + pk.histOfs[1])[h0] = hist0[1];
-            (blockHist + pk.histOfs[2])[h0] = hist0[2];
-            (blockHist + pk.histOfs[3])[h0] = hist0[3];
-
-            (blockHist + pk.histOfs[0])[h1] = hist1[0];
-            (blockHist + pk.histOfs[1])[h1] = hist1[1];
-            (blockHist + pk.histOfs[2])[h1] = hist1[2];
-            (blockHist + pk.histOfs[3])[h1] = hist1[3];
         }
 #else
         for( ; k < C4; k++ )
@@ -1123,26 +947,16 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     float* hist = &_hist[0], sum = 0.0f, partSum[4];
     size_t i = 0, sz = blockHistogramSize;

-#if CV_SSE2
-    __m128 p0 = _mm_loadu_ps(hist);
-    __m128 s = _mm_mul_ps(p0, p0);
+#if CV_SIMD128
+    v_float32x4 p0 = v_load(hist);
+    v_float32x4 s = p0 * p0;

     for (i = 4; i <= sz - 4; i += 4)
     {
-        p0 = _mm_loadu_ps(hist + i);
-        s = _mm_add_ps(s, _mm_mul_ps(p0, p0));
+        p0 = v_load(hist + i);
+        s += p0 * p0;
     }
-    _mm_storeu_ps(partSum, s);
-#elif CV_NEON
-    float32x4_t p0 = vld1q_f32(hist);
-    float32x4_t s = vmulq_f32(p0, p0);
-
-    for (i = 4; i <= sz - 4; i += 4)
-    {
-        p0 = vld1q_f32(hist + i);
-        s = vaddq_f32(s, vmulq_f32(p0, p0));
-    }
-    vst1q_f32(partSum, s);
+    v_store(partSum, s);
 #else
     partSum[0] = 0.0f;
     partSum[1] = 0.0f;
@@ -1165,44 +979,25 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold;
     i = 0, sum = 0.0f;

-#if CV_SSE2
-    __m128 _scale = _mm_set1_ps(scale);
-    static __m128 _threshold = _mm_set1_ps(thresh);
+#if CV_SIMD128
+    v_float32x4 _scale = v_setall_f32(scale);
+    v_float32x4 _threshold = v_setall_f32(thresh);

-    __m128 p = _mm_mul_ps(_scale, _mm_loadu_ps(hist));
-    p = _mm_min_ps(p, _threshold);
-    s = _mm_mul_ps(p, p);
-    _mm_storeu_ps(hist, p);
+    v_float32x4 p = _scale * v_load(hist);
+    p = v_min(p, _threshold);
+    s = p * p;
+    v_store(hist, p);

     for(i = 4 ; i <= sz - 4; i += 4)
     {
-        p = _mm_loadu_ps(hist + i);
-        p = _mm_mul_ps(p, _scale);
-        p = _mm_min_ps(p, _threshold);
-        s = _mm_add_ps(s, _mm_mul_ps(p, p));
-        _mm_storeu_ps(hist + i, p);
+        p = v_load(hist + i);
+        p *= _scale;
+        p = v_min(p, _threshold);
+        s += p * p;
+        v_store(hist + i, p);
     }
-    _mm_storeu_ps(partSum, s);
-#elif CV_NEON
-    float32x4_t _scale = vdupq_n_f32(scale);
-    static float32x4_t _threshold = vdupq_n_f32(thresh);
-
-    float32x4_t p = vmulq_f32(_scale, vld1q_f32(hist));
-    p = vminq_f32(p, _threshold);
-    s = vmulq_f32(p, p);
-    vst1q_f32(hist, p);
-
-    for(i = 4 ; i <= sz - 4; i += 4)
-    {
-        p = vld1q_f32(hist + i);
-        p = vmulq_f32(p, _scale);
-        p = vminq_f32(p, _threshold);
-        s = vaddq_f32(s, vmulq_f32(p, p));
-        vst1q_f32(hist + i, p);
-    }
-
-    vst1q_f32(partSum, s);
+    v_store(partSum, s);
 #else
     partSum[0] = 0.0f;
     partSum[1] = 0.0f;
@@ -1230,19 +1025,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
     }

     scale = 1.f/(std::sqrt(sum)+1e-3f), i = 0;
-#if CV_SSE2
-    __m128 _scale2 = _mm_set1_ps(scale);
-    for ( ; i <= sz - 4; i += 4)
-    {
-        __m128 t = _mm_mul_ps(_scale2, _mm_loadu_ps(hist + i));
-        _mm_storeu_ps(hist + i, t);
-    }
-#elif CV_NEON
-    float32x4_t _scale2 = vdupq_n_f32(scale);
+#if CV_SIMD128
+    v_float32x4 _scale2 = v_setall_f32(scale);
     for ( ; i <= sz - 4; i += 4)
     {
-        float32x4_t t = vmulq_f32(_scale2, vld1q_f32(hist + i));
-        vst1q_f32(hist + i, t);
+        v_float32x4 t = _scale2 * v_load(hist + i);
+        v_store(hist + i, t);
     }
 #endif
     for ( ; i < sz; ++i)
@@ -1690,7 +1478,7 @@ void HOGDescriptor::detect(InputArray _img,
     double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
     std::vector<float> blockHist(blockHistogramSize);

-#if CV_SSE2 || CV_NEON
+#if CV_SIMD128
     float partSum[4];
 #endif

@@ -1719,37 +1507,20 @@ void HOGDescriptor::detect(InputArray _img,
             Point pt = pt0 + bj.imgOffset;
             const float* vec = cache.getBlock(pt, &blockHist[0]);
-#if CV_SSE2
-            __m128 _vec = _mm_loadu_ps(vec);
-            __m128 _svmVec = _mm_loadu_ps(svmVec);
-            __m128 sum = _mm_mul_ps(_svmVec, _vec);
-
-            for( k = 4; k <= blockHistogramSize - 4; k += 4 )
-            {
-                _vec = _mm_loadu_ps(vec + k);
-                _svmVec = _mm_loadu_ps(svmVec + k);
-
-                sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
-            }
-
-            _mm_storeu_ps(partSum, sum);
-            double t0 = partSum[0] + partSum[1];
-            double t1 = partSum[2] + partSum[3];
-            s += t0 + t1;
-#elif CV_NEON
-            float32x4_t _vec = vld1q_f32(vec);
-            float32x4_t _svmVec = vld1q_f32(svmVec);
-            float32x4_t sum = vmulq_f32(_svmVec, _vec);
+#if CV_SIMD128
+            v_float32x4 _vec = v_load(vec);
+            v_float32x4 _svmVec = v_load(svmVec);
+            v_float32x4 sum = _svmVec * _vec;

             for( k = 4; k <= blockHistogramSize - 4; k += 4 )
             {
-                _vec = vld1q_f32(vec + k);
-                _svmVec = vld1q_f32(svmVec + k);
+                _vec = v_load(vec + k);
+                _svmVec = v_load(svmVec + k);

-                sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec));
+                sum += _vec * _svmVec;
             }
-            vst1q_f32(partSum, sum);
+            v_store(partSum, sum);
             double t0 = partSum[0] + partSum[1];
             double t1 = partSum[2] + partSum[3];
             s += t0 + t1;
@@ -3530,7 +3301,7 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
     double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
     std::vector<float> blockHist(blockHistogramSize);

-#if CV_SSE2 || CV_NEON
+#if CV_SIMD128
     float partSum[4];
 #endif

@@ -3557,37 +3328,21 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
         // need to divide this into 4 parts!
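+        // SIMD dot product of the block histogram against the SVM weights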
         const float* vec = cache.getBlock(pt, &blockHist[0]);
-#if CV_SSE2
-        __m128 _vec = _mm_loadu_ps(vec);
-        __m128 _svmVec = _mm_loadu_ps(svmVec);
-        __m128 sum = _mm_mul_ps(_svmVec, _vec);
+#if CV_SIMD128
+        v_float32x4 _vec = v_load(vec);
+        v_float32x4 _svmVec = v_load(svmVec);
+        v_float32x4 sum = _svmVec * _vec;

         for( k = 4; k <= blockHistogramSize - 4; k += 4 )
         {
-            _vec = _mm_loadu_ps(vec + k);
-            _svmVec = _mm_loadu_ps(svmVec + k);
+            _vec = v_load(vec + k);
+            _svmVec = v_load(svmVec + k);

-            sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
+            sum += _vec * _svmVec;
         }

-        _mm_storeu_ps(partSum, sum);
-        double t0 = partSum[0] + partSum[1];
-        double t1 = partSum[2] + partSum[3];
-        s += t0 + t1;
-#elif CV_NEON
-        float32x4_t _vec = vld1q_f32(vec);
-        float32x4_t _svmVec = vld1q_f32(svmVec);
-        float32x4_t sum = vmulq_f32(_svmVec, _vec);
-
-        for( k = 4; k <= blockHistogramSize - 4; k += 4 )
-        {
-            _vec = vld1q_f32(vec + k);
-            _svmVec = vld1q_f32(svmVec + k);
-
-            sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec));
-        }
+        v_store(partSum, sum);

-        vst1q_f32(partSum, sum);
         double t0 = partSum[0] + partSum[1];
         double t1 = partSum[2] + partSum[3];
         s += t0 + t1;
diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt
index 03d550fb81..44f35eb59b 100644
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(the_description "Images stitching")

 if(HAVE_CUDA)
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wstrict-aliasing)
 endif()

 set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 0239650470..c6d2efd665 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -499,7 +499,7 @@ struct CvCapture_FFMPEG
     double r2d(AVRational r) const;
     int64_t dts_to_frame_number(int64_t dts);
-    double dts_to_sec(int64_t dts);
+    double dts_to_sec(int64_t dts) const;

     AVFormatContext * ic;
     AVCodec * avcodec;
@@ -892,7 +892,14 @@ bool CvCapture_FFMPEG::open( const char* _filename )
 #else
     av_dict_set(&dict, "rtsp_transport", "tcp", 0);
 #endif
-    int err = avformat_open_input(&ic, _filename, NULL, &dict);
+    AVInputFormat* input_format = NULL;
+    AVDictionaryEntry* entry = av_dict_get(dict, "input_format", NULL, 0);
+    if (entry != 0)
+    {
+        input_format = av_find_input_format(entry->value);
+    }
+
+    int err = avformat_open_input(&ic, _filename, input_format, &dict);
 #else
     int err = av_open_input_file(&ic, _filename, NULL, 0, NULL);
 #endif
@@ -1168,7 +1175,11 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
     switch( property_id )
     {
     case CAP_PROP_POS_MSEC:
-        return 1000.0*(double)frame_number/get_fps();
+        if (picture_pts == AV_NOPTS_VALUE_)
+        {
+            return 0;
+        }
+        return (dts_to_sec(picture_pts) * 1000);
     case CAP_PROP_POS_FRAMES:
         return (double)frame_number;
     case CAP_PROP_POS_AVI_RATIO:
@@ -1278,7 +1289,7 @@ int64_t CvCapture_FFMPEG::dts_to_frame_number(int64_t dts)
     return (int64_t)(get_fps() * sec + 0.5);
 }

-double CvCapture_FFMPEG::dts_to_sec(int64_t dts)
+double CvCapture_FFMPEG::dts_to_sec(int64_t dts) const
 {
     return (double)(dts - ic->streams[video_stream]->start_time) *
         r2d(ic->streams[video_stream]->time_base);
diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index 1922213454..db505b780f 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -796,11 +796,10 @@ bool CvCaptureCAM_V4L::open(int _index)
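+        // numeric index: open the camera through its /dev/videoN device node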
         name = cv::format("/dev/video%d", _index);
     }

-    /* Print the CameraNumber at the end of the string with a width of one character */
     bool res = open(name.c_str());
     if (!res)
     {
-        fprintf(stderr, "VIDEOIO ERROR: V4L: can't open camera by index %d\n", _index);
+        CV_LOG_WARNING(NULL, cv::format("VIDEOIO ERROR: V4L: can't open camera by index %d", _index));
     }
     return res;
 }
diff --git a/modules/videoio/test/test_video_io.cpp b/modules/videoio/test/test_video_io.cpp
index f6a345e04a..1330698d38 100644
--- a/modules/videoio/test/test_video_io.cpp
+++ b/modules/videoio/test/test_video_io.cpp
@@ -84,7 +84,7 @@ public:
     {
         if (!videoio_registry::hasBackend(apiPref))
             throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
-        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265"))
+        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
             throw SkipTestException("Unstable MSMF test");
         writeVideo();
         VideoCapture cap;
@@ -172,7 +172,7 @@ public:
     {
         if (!videoio_registry::hasBackend(apiPref))
             throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
-        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265"))
+        if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
             throw SkipTestException("Unstable MSMF test");
         VideoCapture cap;
         EXPECT_NO_THROW(cap.open(video_file, apiPref));
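Note on the hog.cpp changes above: each parallel SSE2/NEON pair collapses into a single CV_SIMD128 block written against OpenCV's universal intrinsics, which compile to SSE2 on x86, NEON on ARM, and VSX on ppc64le from one source path. A minimal standalone sketch of the recurring load-multiply-accumulate idiom (illustrative only; the helper name and the plain-array interface are assumptions, not code from this patch):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Hypothetical helper: dot product written once against the universal
// intrinsics API instead of separate SSE2 and NEON code paths.
static float dotProduct(const float* a, const float* b, int n)
{
    float result = 0.f;
    int k = 0;
#if CV_SIMD128
    cv::v_float32x4 sum = cv::v_setzero_f32();
    for (; k <= n - 4; k += 4)
        sum += cv::v_load(a + k) * cv::v_load(b + k); // 4 lanes per iteration
    result = cv::v_reduce_sum(sum);                   // horizontal sum of the 4 lanes
#endif
    for (; k < n; ++k)                                // scalar tail
        result += a[k] * b[k];
    return result;
}
```

The scalar tail after the SIMD loop is the same pattern the `for ( ; i < sz; ++i)`-style loops retained throughout hog.cpp serve: correctness for sizes that are not a multiple of the four-float vector width.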