Merge remote-tracking branch 'upstream/3.4' into merge-3.4

Branch: pull/15295/head
Authored and committed by Alexander Alekhin (6 years ago)
Commit: 2ad0487cec
Changed files (first 28 shown with changed-line counts):

 1. cmake/OpenCVCompilerOptions.cmake (3)
 2. modules/core/include/opencv2/core/hal/intrin_vsx.hpp (24)
 3. modules/core/src/system.cpp (4)
 4. modules/core/test/test_ptr.cpp (9)
 5. modules/dnn/CMakeLists.txt (4)
 6. modules/dnn/perf/perf_net.cpp (17)
 7. modules/dnn/src/dnn.cpp (34)
 8. modules/dnn/src/layers/blank_layer.cpp (11)
 9. modules/dnn/src/layers/concat_layer.cpp (2)
10. modules/dnn/src/layers/convolution_layer.cpp (25)
11. modules/dnn/src/layers/normalize_bbox_layer.cpp (12)
12. modules/dnn/src/layers/pooling_layer.cpp (2)
13. modules/dnn/src/layers/scale_layer.cpp (11)
14. modules/dnn/src/layers/slice_layer.cpp (10)
15. modules/dnn/src/layers/softmax_layer.cpp (3)
16. modules/dnn/src/op_inf_engine.cpp (179)
17. modules/dnn/src/op_inf_engine.hpp (12)
18. modules/dnn/test/test_ie_models.cpp (40)
19. modules/dnn/test/test_misc.cpp (36)
20. modules/dnn/test/test_tf_importer.cpp (25)
21. modules/dnn/test/test_torch_importer.cpp (6)
22. modules/js/src/embindgen.py (2)
23. modules/js/test/test_imgproc.js (18)
24. modules/objdetect/src/hog.cpp (531)
25. modules/stitching/CMakeLists.txt (2)
26. modules/videoio/src/cap_ffmpeg_impl.hpp (19)
27. modules/videoio/src/cap_v4l.cpp (3)
28. modules/videoio/test/test_video_io.cpp (4)

cmake/OpenCVCompilerOptions.cmake

@@ -123,6 +123,9 @@ if(CV_GCC OR CV_CLANG)
   add_extra_compiler_option(-Wsign-promo)
   add_extra_compiler_option(-Wuninitialized)
   add_extra_compiler_option(-Winit-self)
+  if(CV_GCC AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0) AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0))
+    add_extra_compiler_option(-Wno-psabi)
+  endif()
   if(HAVE_CXX11)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT ENABLE_PRECOMPILED_HEADERS)
       add_extra_compiler_option(-Wsuggest-override)

modules/core/include/opencv2/core/hal/intrin_vsx.hpp

@@ -845,36 +845,24 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 /** Mask **/
 inline int v_signmask(const v_uint8x16& a)
 {
-    vec_uchar16 sv  = vec_sr(a.val, vec_uchar16_sp(7));
-    static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
-    sv = vec_sl(sv, slm);
-    vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
-    static const vec_uint4 slm4 = {0, 0, 8, 8};
-    sv4 = vec_sl(sv4, slm4);
-    return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
+    static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_int8x16& a)
 { return v_signmask(v_reinterpret_as_u8(a)); }
 inline int v_signmask(const v_int16x8& a)
 {
-    static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
-    vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
-    sv = vec_sl(sv, slm);
-    vec_int4 svi = vec_int4_z;
-    svi = vec_sums(vec_sum4s(sv, svi), svi);
-    return vec_extract(svi, 3);
+    static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint16x8& a)
 { return v_signmask(v_reinterpret_as_s16(a)); }
 inline int v_signmask(const v_int32x4& a)
 {
-    static const vec_uint4 slm = {0, 1, 2, 3};
-    vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
-    sv = vec_sl(sv, slm);
-    sv = vec_sums(sv, vec_int4_z);
-    return vec_extract(sv, 3);
+    static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
 }
 inline int v_signmask(const v_uint32x4& a)
 { return v_signmask(v_reinterpret_as_s32(a)); }

modules/core/src/system.cpp

@@ -554,7 +554,9 @@ struct HWFeatures
         have[CV_CPU_FP16] = true;
     #endif
     #endif
+    #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
+        have[CV_CPU_NEON] = true;
+    #endif
         // there's no need to check VSX availability in runtime since it's always available on ppc64le CPUs
         have[CV_CPU_VSX] = (CV_VSX);
         // TODO: Check VSX3 availability in runtime for other platforms

modules/core/test/test_ptr.cpp

@@ -160,14 +160,7 @@ TEST(Core_Ptr, assignment)
 {
     Ptr<Reporter> p1(new Reporter(&deleted1));
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-    CV_DO_PRAGMA(GCC diagnostic push)
-    CV_DO_PRAGMA(GCC diagnostic ignored "-Wself-assign-overloaded")
-#endif
-    p1 = p1;
-#if defined(__clang__) && (__clang_major__ >= 9) && !defined(__APPLE__)
-    CV_DO_PRAGMA(GCC diagnostic pop)
-#endif
+    p1 = *&p1;
     EXPECT_FALSE(deleted1);
 }

modules/dnn/CMakeLists.txt

@@ -37,7 +37,9 @@ else()
                                      -Wunused-parameter -Wsign-compare
          )
 endif()
+if(HAVE_CUDA)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
 if(NOT HAVE_CXX11)
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-undef)  # LANG_CXX11 from protobuf files
 endif()

modules/dnn/perf/perf_net.cpp

@@ -123,9 +123,12 @@ PERF_TEST_P_(DNNTestNetwork, SSD)
 PERF_TEST_P_(DNNTestNetwork, OpenFace)
 {
-    if (backend == DNN_BACKEND_HALIDE ||
-        (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD))
+    if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
     processNet("dnn/openface_nn4.small2.v1.t7", "", "",
             Mat(cv::Size(96, 96), CV_32FC3));
 }

@@ -185,16 +188,6 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
 {
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-        && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-        throw SkipTestException("Test is disabled for MyriadX");
-#endif
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019020000)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-        throw SkipTestException("Test is disabled for Myriad in OpenVINO 2019R2");
-#endif
     processNet("dnn/ssd_inception_v2_coco_2017_11_17.pb", "ssd_inception_v2_coco_2017_11_17.pbtxt", "",
             Mat(cv::Size(300, 300), CV_32FC3));
 }

modules/dnn/src/dnn.cpp

@@ -719,21 +719,23 @@ struct DataLayer : public Layer
         CV_Assert(numChannels <= 4);
         // Scale
-        auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                {numChannels});
+        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
+                                       InferenceEngine::Layout::C);
+        auto weights = InferenceEngine::make_shared_blob<float>(td);
         weights->allocate();
-        weights->set(std::vector<float>(numChannels, scaleFactors[0]));
+        float* weight_buf = weights->buffer().as<float*>();
+        std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);
         // Mean subtraction
-        auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                               {numChannels});
+        auto biases = InferenceEngine::make_shared_blob<float>(td);
         biases->allocate();
-        std::vector<float> biasesVec(numChannels);
+        float* bias_buf = biases->buffer().as<float*>();
         for (int i = 0; i < numChannels; ++i)
         {
-            biasesVec[i] = -means[0][i] * scaleFactors[0];
+            bias_buf[i] = -means[0][i] * scaleFactors[0];
         }
-        biases->set(biasesVec);
         InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
         addConstantData("weights", weights, ieLayer);

@@ -1536,7 +1538,11 @@ struct Net::Impl
                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                     dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
+#else
+                    dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
+#endif
                 }
             }
             else

@@ -1544,7 +1550,11 @@ struct Net::Impl
                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                     dataPtr->name = ld.name;
+#else
+                    dataPtr->setName(ld.name);
+#endif
                 }
             }
         }

@@ -1565,7 +1575,11 @@ struct Net::Impl
                 for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                     dataPtr->name = netInputLayer->outNames[i];
+#else
+                    dataPtr->setName(netInputLayer->outNames[i]);
+#endif
                 }
             }
             else

@@ -1573,7 +1587,11 @@ struct Net::Impl
                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                 {
                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
                     dataPtr->name = ld.name;
+#else
+                    dataPtr->setName(ld.name);
+#endif
                 }
             }
             ieNode->net->addBlobs(ld.inputBlobsWrappers);

modules/dnn/src/layers/blank_layer.cpp

@@ -111,7 +111,8 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        CV_Assert(!input->dims.empty());
+        std::vector<size_t> dims = input->getDims();
+        CV_Assert(!dims.empty());
         InferenceEngine::Builder::Layer ieLayer(name);
         ieLayer.setName(name);

@@ -122,12 +123,10 @@ public:
         else
         {
             ieLayer.setType("Split");
-            ieLayer.getParameters()["axis"] = input->dims.size() - 1;
-            ieLayer.getParameters()["out_sizes"] = input->dims[0];
+            ieLayer.getParameters()["axis"] = dims.size() - 1;
+            ieLayer.getParameters()["out_sizes"] = dims[0];
         }
-        std::vector<size_t> shape(input->dims);
-        std::reverse(shape.begin(), shape.end());
-        ieLayer.setInputPorts({InferenceEngine::Port(shape)});
+        ieLayer.setInputPorts({InferenceEngine::Port(dims)});
         ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }

modules/dnn/src/layers/concat_layer.cpp

@@ -316,7 +316,7 @@ public:
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
         InferenceEngine::Builder::ConcatLayer ieLayer(name);
-        ieLayer.setAxis(clamp(axis, input->dims.size()));
+        ieLayer.setAxis(clamp(axis, input->getDims().size()));
         ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }

modules/dnn/src/layers/convolution_layer.cpp

@@ -541,15 +541,14 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        CV_Assert(input->dims.size() == 4 || input->dims.size() == 5);
+        std::vector<size_t> dims = input->getDims();
+        CV_Assert(dims.size() == 4 || dims.size() == 5);
-        const int inpCn = input->dims[input->dims.size() - 2];  // NOTE: input->dims are reversed (WHIO or WHDIO)
+        const int inpCn = dims[1];
         const int outCn = blobs[0].size[0];
         const int inpGroupCn = blobs[0].size[1];
         const int group = inpCn / inpGroupCn;
-        InferenceEngine::Layout layout = (input->dims.size() == 4) ? InferenceEngine::Layout::OIHW :
-                                                                     InferenceEngine::Layout::NCDHW;
+        InferenceEngine::Layout layout = (dims.size() == 4) ? InferenceEngine::Layout::OIHW :
+                                                              InferenceEngine::Layout::NCDHW;
         auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
         if (fusedWeights)

@@ -561,9 +560,10 @@ public:
         }
         else
         {
-            ieWeights = InferenceEngine::make_shared_blob<float>(
-                                InferenceEngine::Precision::FP32, layout,
-                                ieWeights->dims());
+            ieWeights = InferenceEngine::make_shared_blob<float>({
+                                InferenceEngine::Precision::FP32,
+                                ieWeights->getTensorDesc().getDims(), layout
+                            });
             ieWeights->allocate();
             Mat newWeights = infEngineBlobToMat(ieWeights).reshape(1, outCn);

@@ -1953,9 +1953,10 @@ public:
         auto ieWeights = wrapToInfEngineBlob(blobs[0], layout);
         if (fusedWeights)
         {
-            ieWeights = InferenceEngine::make_shared_blob<float>(
-                                InferenceEngine::Precision::FP32, layout,
-                                ieWeights->dims());
+            ieWeights = InferenceEngine::make_shared_blob<float>({
+                                InferenceEngine::Precision::FP32,
+                                ieWeights->getTensorDesc().getDims(), layout
+                            });
             ieWeights->allocate();
             int inpCn = blobs[0].size[0];

modules/dnn/src/layers/normalize_bbox_layer.cpp

@@ -261,7 +261,8 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        if (input->dims.size() == 4)
+        std::vector<size_t> dims = input->getDims();
+        if (dims.size() == 4)
         {
             InferenceEngine::Builder::NormalizeLayer ieLayer(name);

@@ -270,13 +271,14 @@ public:
             ieLayer.setEpsilon(epsilon);
             InferenceEngine::Builder::Layer l = ieLayer;
-            const int numChannels = input->dims[2];  // NOTE: input->dims are reversed (whcn)
+            const int numChannels = dims[1];
             InferenceEngine::Blob::Ptr weights;
             if (blobs.empty())
             {
-                weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                   InferenceEngine::Layout::C,
-                                                                   {(size_t)numChannels});
+                weights = InferenceEngine::make_shared_blob<float>({
+                              InferenceEngine::Precision::FP32,
+                              {(size_t)numChannels}, InferenceEngine::Layout::C
+                          });
                 weights->allocate();
                 Mat weightsMat = infEngineBlobToMat(weights).reshape(1, numChannels);

modules/dnn/src/layers/pooling_layer.cpp

@@ -167,9 +167,11 @@ public:
             if (kernel_size.size() == 3)
                 return preferableTarget == DNN_TARGET_CPU;
             if (preferableTarget == DNN_TARGET_MYRIAD) {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
                 if (type == MAX && (pad_l == 1 && pad_t == 1) && stride == Size(2, 2) ) {
                     return !isMyriadX();
                 }
+#endif
                 return type == MAX || type == AVE;
             }
             else

modules/dnn/src/layers/scale_layer.cpp

@@ -207,12 +207,13 @@ public:
         }
         else
         {
-            auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                                    {numChannels});
+            auto weights = InferenceEngine::make_shared_blob<float>({
+                               InferenceEngine::Precision::FP32, {(size_t)numChannels},
+                               InferenceEngine::Layout::C
+                           });
             weights->allocate();
-            std::vector<float> ones(numChannels, 1);
-            weights->set(ones);
+            float* buf = weights->buffer().as<float*>();
+            std::fill(buf, buf + numChannels, 1);
             addConstantData("weights", weights, l);
         }
         if (hasBias)

modules/dnn/src/layers/slice_layer.cpp

@@ -301,14 +301,14 @@ public:
         {
             std::vector<size_t> outShape(numDims);
             for (int i = 0; i < numDims; ++i)
-                outShape[numDims - 1 - i] = sliceRanges[0][i].size();
+                outShape[i] = sliceRanges[0][i].size();
             ieLayer.getInputPorts()[1].setParameter("type", "weights");
-            // Fake blob which will be moved to inputs (as weights).
-            auto shapeSource = InferenceEngine::make_shared_blob<float>(
-                                   InferenceEngine::Precision::FP32,
-                                   InferenceEngine::Layout::ANY, outShape);
+            auto shapeSource = InferenceEngine::make_shared_blob<float>({
+                                   InferenceEngine::Precision::FP32, outShape,
+                                   InferenceEngine::Layout::ANY
+                               });
             shapeSource->allocate();
             addConstantData("weights", shapeSource, ieLayer);
         }

modules/dnn/src/layers/softmax_layer.cpp

@@ -329,7 +329,8 @@ public:
         InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
         InferenceEngine::Builder::SoftMaxLayer ieLayer(name);
-        ieLayer.setAxis(clamp(axisRaw, input->dims.size()));
+        ieLayer.setAxis(clamp(axisRaw, input->getDims().size()));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
     }
 #endif  // HAVE_INF_ENGINE

modules/dnn/src/op_inf_engine.cpp

@@ -45,13 +45,13 @@ infEngineWrappers(const std::vector<Ptr<BackendWrapper> >& ptrs)
 InfEngineBackendNet::InfEngineBackendNet() : netBuilder("")
 {
     hasNetOwner = false;
-    targetDevice = InferenceEngine::TargetDevice::eCPU;
+    device_name = "CPU";
 }
 InfEngineBackendNet::InfEngineBackendNet(InferenceEngine::CNNNetwork& net) : netBuilder(""), cnn(net)
 {
     hasNetOwner = true;
-    targetDevice = InferenceEngine::TargetDevice::eCPU;
+    device_name = "CPU";
 }
 void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& inputs,

@@ -66,16 +66,13 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
     for (size_t i = 0; i < inpWrappers.size(); ++i)
     {
         const auto& inp = inpWrappers[i];
-        const std::string& inpName = inp->dataPtr->name;
+        const std::string& inpName = inp->dataPtr->getName();
         int inpId;
         it = layers.find(inpName);
         if (it == layers.end())
         {
             InferenceEngine::Builder::InputLayer inpLayer(!inpName.empty() ? inpName : kDefaultInpLayerName);
-            std::vector<size_t> shape(inp->blob->dims());
-            std::reverse(shape.begin(), shape.end());
+            std::vector<size_t> shape(inp->blob->getTensorDesc().getDims());
             inpLayer.setPort(InferenceEngine::Port(shape));
             inpId = netBuilder.addLayer(inpLayer);

@@ -89,7 +86,11 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
     }
     CV_Assert(!outputs.empty());
     InferenceEngine::DataPtr dataPtr = infEngineDataNode(outputs[0]);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     dataPtr->name = layerName;
+#else
+    dataPtr->setName(layerName);
+#endif
 }
 void InfEngineBackendNet::init(int targetId)

@@ -115,21 +116,22 @@ void InfEngineBackendNet::init(int targetId)
     switch (targetId)
     {
-        case DNN_TARGET_CPU:
-            targetDevice = InferenceEngine::TargetDevice::eCPU;
-            break;
-        case DNN_TARGET_OPENCL: case DNN_TARGET_OPENCL_FP16:
-            targetDevice = InferenceEngine::TargetDevice::eGPU;
-            break;
-        case DNN_TARGET_MYRIAD:
-            targetDevice = InferenceEngine::TargetDevice::eMYRIAD;
-            break;
-        case DNN_TARGET_FPGA:
-            targetDevice = InferenceEngine::TargetDevice::eFPGA;
-            break;
-        default:
-            CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
-    }
+        case DNN_TARGET_CPU:
+            device_name = "CPU";
+            break;
+        case DNN_TARGET_OPENCL:
+        case DNN_TARGET_OPENCL_FP16:
+            device_name = "GPU";
+            break;
+        case DNN_TARGET_MYRIAD:
+            device_name = "MYRIAD";
+            break;
+        case DNN_TARGET_FPGA:
+            device_name = "FPGA";
+            break;
+        default:
+            CV_Error(Error::StsNotImplemented, "Unknown target");
+    };
     for (const auto& name : requestedOutputs)
     {

@@ -141,14 +143,14 @@ void InfEngineBackendNet::init(int targetId)
         const std::string& name = it.first;
         auto blobIt = allBlobs.find(name);
         CV_Assert(blobIt != allBlobs.end());
-        it.second->setPrecision(blobIt->second->precision());
+        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());
     }
     for (const auto& it : cnn.getOutputsInfo())
     {
         const std::string& name = it.first;
         auto blobIt = allBlobs.find(name);
         CV_Assert(blobIt != allBlobs.end());
-        it.second->setPrecision(blobIt->second->precision());  // Should be always FP32
+        it.second->setPrecision(blobIt->second->getTensorDesc().getPrecision());  // Should be always FP32
     }
     initPlugin(cnn);

@@ -223,16 +225,13 @@ static InferenceEngine::Layout estimateLayout(const Mat& m)
 static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std::string& name = "")
 {
-    std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-    std::reverse(reversedShape.begin(), reversedShape.end());
+    std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
     if (m.type() == CV_32F)
-        return InferenceEngine::DataPtr(
-            new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m))
-        );
+        return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+               {InferenceEngine::Precision::FP32, shape, estimateLayout(m)}));
     else if (m.type() == CV_8U)
-        return InferenceEngine::DataPtr(
-            new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::U8, estimateLayout(m))
-        );
+        return InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+               {InferenceEngine::Precision::U8, shape, estimateLayout(m)}));
     else
         CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }

@@ -241,33 +240,33 @@ InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector<s
                                                InferenceEngine::Layout layout)
 {
     if (m.type() == CV_32F)
-        return InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
-                                                        layout, shape, (float*)m.data);
+        return InferenceEngine::make_shared_blob<float>(
+            {InferenceEngine::Precision::FP32, shape, layout}, (float*)m.data);
     else if (m.type() == CV_8U)
-        return InferenceEngine::make_shared_blob<uint8_t>(InferenceEngine::Precision::U8,
-                                                          layout, shape, (uint8_t*)m.data);
+        return InferenceEngine::make_shared_blob<uint8_t>(
+            {InferenceEngine::Precision::U8, shape, layout}, (uint8_t*)m.data);
     else
         CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type()));
 }
 InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout)
 {
-    std::vector<size_t> reversedShape(&m.size[0], &m.size[0] + m.dims);
-    std::reverse(reversedShape.begin(), reversedShape.end());
-    return wrapToInfEngineBlob(m, reversedShape, layout);
+    std::vector<size_t> shape(&m.size[0], &m.size[0] + m.dims);
+    return wrapToInfEngineBlob(m, shape, layout);
 }
 InferenceEngine::Blob::Ptr cloneBlob(const InferenceEngine::Blob::Ptr& blob)
 {
-    InferenceEngine::Precision precision = blob->precision();
     InferenceEngine::Blob::Ptr copy;
+    auto description = blob->getTensorDesc();
+    InferenceEngine::Precision precision = description.getPrecision();
     if (precision == InferenceEngine::Precision::FP32)
     {
-        copy = InferenceEngine::make_shared_blob<float>(precision, blob->layout(), blob->dims());
+        copy = InferenceEngine::make_shared_blob<float>(description);
     }
     else if (precision == InferenceEngine::Precision::U8)
     {
-        copy = InferenceEngine::make_shared_blob<uint8_t>(precision, blob->layout(), blob->dims());
+        copy = InferenceEngine::make_shared_blob<uint8_t>(description);
     }
     else
         CV_Error(Error::StsNotImplemented, "Unsupported blob precision");

@@ -296,10 +295,8 @@ InfEngineBackendWrapper::InfEngineBackendWrapper(Ptr<BackendWrapper> wrapper)
     Ptr<InfEngineBackendWrapper> ieWrapper = wrapper.dynamicCast<InfEngineBackendWrapper>();
     CV_Assert(!ieWrapper.empty());
     InferenceEngine::DataPtr srcData = ieWrapper->dataPtr;
-    dataPtr = InferenceEngine::DataPtr(
-        new InferenceEngine::Data(srcData->name, srcData->dims, srcData->precision,
-                                  srcData->layout)
-    );
+    dataPtr = InferenceEngine::DataPtr(new InferenceEngine::Data(srcData->getName(), srcData->getTensorDesc()));
     blob = ieWrapper->blob;
 }

@@ -324,12 +321,19 @@ void InfEngineBackendWrapper::setHostDirty()
 }
-static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
+static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr>& getSharedPlugins()
 {
-    static std::map<InferenceEngine::TargetDevice, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
+    static std::map<std::string, InferenceEngine::InferenceEnginePluginPtr> sharedPlugins;
     return sharedPlugins;
 }
+#else
+static InferenceEngine::Core& getCore()
+{
+    static InferenceEngine::Core core;
+    return core;
+}
+#endif
 #if !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)
 static bool detectMyriadX_()

@@ -362,24 +366,29 @@ static bool detectMyriadX_()
     InferenceEngine::CNNNetwork cnn = InferenceEngine::CNNNetwork(
                                       InferenceEngine::Builder::convertToICNNNetwork(builder.build()));
-    InferenceEngine::TargetDevice device = InferenceEngine::TargetDevice::eMYRIAD;
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     InferenceEngine::InferenceEnginePluginPtr enginePtr;
     {
         AutoLock lock(getInitializationMutex());
         auto& sharedPlugins = getSharedPlugins();
-        auto pluginIt = sharedPlugins.find(device);
+        auto pluginIt = sharedPlugins.find("MYRIAD");
         if (pluginIt != sharedPlugins.end()) {
             enginePtr = pluginIt->second;
         } else {
             auto dispatcher = InferenceEngine::PluginDispatcher({""});
-            enginePtr = dispatcher.getSuitablePlugin(device);
-            sharedPlugins[device] = enginePtr;
+            enginePtr = dispatcher.getPluginByDevice("MYRIAD");
+            sharedPlugins["MYRIAD"] = enginePtr;
         }
     }
     auto plugin = InferenceEngine::InferencePlugin(enginePtr);
     try
     {
         auto netExec = plugin.LoadNetwork(cnn, {{"VPU_PLATFORM", "VPU_2480"}});
+#else
+    try
+    {
+        auto netExec = getCore().LoadNetwork(cnn, "MYRIAD", {{"VPU_PLATFORM", "VPU_2480"}});
+#endif
         auto infRequest = netExec.CreateInferRequest();
     } catch(...) {
         return false;

@@ -388,38 +397,41 @@ static bool detectMyriadX_()
 }
 #endif  // !defined(OPENCV_DNN_IE_VPU_TYPE_DEFAULT)
-void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
+void InfEngineBackendNet::initPlugin(InferenceEngine::CNNNetwork& net)
 {
     CV_Assert(!isInitialized());
     try
     {
         AutoLock lock(getInitializationMutex());
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
         auto& sharedPlugins = getSharedPlugins();
-        auto pluginIt = sharedPlugins.find(targetDevice);
+        auto pluginIt = sharedPlugins.find(device_name);
         if (pluginIt != sharedPlugins.end())
         {
            enginePtr = pluginIt->second;
        }
        else
+#endif
        {
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
            auto dispatcher = InferenceEngine::PluginDispatcher({""});
-            if (targetDevice == InferenceEngine::TargetDevice::eFPGA)
+            if (device_name == "FPGA")
                enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
            else
-                enginePtr = dispatcher.getSuitablePlugin(targetDevice);
-            sharedPlugins[targetDevice] = enginePtr;
+                enginePtr = dispatcher.getPluginByDevice(device_name);
+            sharedPlugins[device_name] = enginePtr;
+#else
+            isInit = true;
+#endif
            std::vector<std::string> candidates;
            std::string param_pluginPath = utils::getConfigurationParameterString("OPENCV_DNN_IE_EXTRA_PLUGIN_PATH", "");
            if (!param_pluginPath.empty())
            {
                candidates.push_back(param_pluginPath);
            }
-            if (targetDevice == InferenceEngine::TargetDevice::eCPU ||
-                targetDevice == InferenceEngine::TargetDevice::eFPGA)
+            if (device_name == "CPU" || device_name == "FPGA")
            {
                std::string suffixes[] = {"_avx2", "_sse4", ""};
                bool haveFeature[] = {

@@ -449,7 +461,12 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
                 {
                     InferenceEngine::IExtensionPtr extension =
                         InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(libName);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
                     enginePtr->AddExtension(extension, 0);
+#else
+                    getCore().AddExtension(extension, "CPU");
+#endif
                     CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << libName);
                     found = true;
                     break;

@@ -463,14 +480,24 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
             // Some of networks can work without a library of extra layers.
 #ifndef _WIN32
             // Limit the number of CPU threads.
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
             enginePtr->SetConfig({{
                 InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
             }}, 0);
+#else
+            if (device_name == "CPU")
+                getCore().SetConfig({{
+                    InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
+                }}, device_name);
+#endif
 #endif
         }
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
         plugin = InferenceEngine::InferencePlugin(enginePtr);
         netExec = plugin.LoadNetwork(net, {});
+#else
         netExec = getCore().LoadNetwork(net, device_name);
+#endif
     }
     catch (const std::exception& ex)
     {

@@ -480,7 +507,11 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
 bool InfEngineBackendNet::isInitialized()
 {
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
     return (bool)enginePtr;
+#else
+    return isInit;
+#endif
 }
 void InfEngineBackendNet::addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs)

@@ -488,7 +519,7 @@ void InfEngineBackendNet::addBlobs(const std::vector<cv::Ptr<BackendWrapper> >&
     auto wrappers = infEngineWrappers(ptrs);
     for (const auto& wrapper : wrappers)
     {
-        std::string name = wrapper->dataPtr->name;
+        std::string name = wrapper->dataPtr->getName();
         name = name.empty() ? kDefaultInpLayerName : name;
         allBlobs.insert({name, wrapper->blob});
     }

@@ -503,7 +534,7 @@ void InfEngineBackendNet::InfEngineReqWrapper::makePromises(const std::vector<Pt
     for (int i = 0; i < outs.size(); ++i)
     {
         outs[i]->futureMat = outProms[i].getArrayResult();
-        outsNames[i] = outs[i]->dataPtr->name;
+        outsNames[i] = outs[i]->dataPtr->getName();
     }
 }

@@ -627,11 +658,12 @@ void InfEngineBackendNet::forward(const std::vector<Ptr<BackendWrapper> >& outBl
 Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
 {
     // NOTE: Inference Engine sizes are reversed.
-    std::vector<size_t> dims = blob->dims();
-    std::vector<int> size(dims.rbegin(), dims.rend());
+    std::vector<size_t> dims = blob->getTensorDesc().getDims();
+    std::vector<int> size(dims.begin(), dims.end());
+    auto precision = blob->getTensorDesc().getPrecision();
     int type = -1;
-    switch (blob->precision())
+    switch (precision)
     {
     case InferenceEngine::Precision::FP32: type = CV_32F; break;
     case InferenceEngine::Precision::U8: type = CV_8U; break;

@@ -685,7 +717,10 @@ void InfEngineBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArra
 InferenceEngine::Blob::Ptr convertFp16(const InferenceEngine::Blob::Ptr& blob)
 {
-    auto halfs = InferenceEngine::make_shared_blob<int16_t>(InferenceEngine::Precision::FP16, blob->layout(), blob->dims());
+    auto halfs = InferenceEngine::make_shared_blob<int16_t>({
+        InferenceEngine::Precision::FP16, blob->getTensorDesc().getDims(),
+        blob->getTensorDesc().getLayout()
+    });
     halfs->allocate();
     Mat floatsData(1, blob->size(), CV_32F, blob->buffer());
     Mat halfsData(1, blob->size(), CV_16SC1, halfs->buffer());

@@ -732,7 +767,11 @@ void resetMyriadDevice()
 {
 #ifdef HAVE_INF_ENGINE
     AutoLock lock(getInitializationMutex());
-    getSharedPlugins().erase(InferenceEngine::TargetDevice::eMYRIAD);
+#if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
+    getSharedPlugins().erase("MYRIAD");
+#else
+    getCore().UnregisterPlugin("MYRIAD");
+#endif
 #endif  // HAVE_INF_ENGINE
 }

modules/dnn/src/op_inf_engine.hpp

@@ -92,18 +92,22 @@ public:
     void forward(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                  bool isAsync);
-    void initPlugin(InferenceEngine::ICNNNetwork& net);
+    void initPlugin(InferenceEngine::CNNNetwork& net);
     void addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs);

 private:
     InferenceEngine::Builder::Network netBuilder;
-    InferenceEngine::InferenceEnginePluginPtr enginePtr;
-    InferenceEngine::InferencePlugin plugin;
     InferenceEngine::ExecutableNetwork netExec;
     InferenceEngine::BlobMap allBlobs;
-    InferenceEngine::TargetDevice targetDevice;
+    std::string device_name;
+#if INF_ENGINE_VER_MAJOR_LE(2019010000)
+    InferenceEngine::InferenceEnginePluginPtr enginePtr;
+    InferenceEngine::InferencePlugin plugin;
+#else
+    bool isInit = false;
+#endif

     struct InfEngineReqWrapper
     {

modules/dnn/test/test_ie_models.cpp

@@ -136,13 +136,10 @@ static const std::vector<std::string> getOpenVINOTestModelsList()
 static inline void genData(const std::vector<size_t>& dims, Mat& m, Blob::Ptr& dataPtr)
 {
-    std::vector<int> reversedDims(dims.begin(), dims.end());
-    std::reverse(reversedDims.begin(), reversedDims.end());
-    m.create(reversedDims, CV_32F);
+    m.create(std::vector<int>(dims.begin(), dims.end()), CV_32F);
     randu(m, -1, 1);
-    dataPtr = make_shared_blob<float>(Precision::FP32, dims, (float*)m.data);
+    dataPtr = make_shared_blob<float>({Precision::FP32, dims, Layout::ANY}, (float*)m.data);
 }
 void runIE(Target target, const std::string& xmlPath, const std::string& binPath,

@@ -154,32 +151,42 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
     CNNNetwork net = reader.getNetwork();
+    std::string device_name;
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+    Core ie;
+#else
     InferenceEnginePluginPtr enginePtr;
     InferencePlugin plugin;
+#endif
     ExecutableNetwork netExec;
     InferRequest infRequest;
     try
     {
-        auto dispatcher = InferenceEngine::PluginDispatcher({""});
         switch (target)
         {
             case DNN_TARGET_CPU:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eCPU);
+                device_name = "CPU";
                 break;
             case DNN_TARGET_OPENCL:
             case DNN_TARGET_OPENCL_FP16:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eGPU);
+                device_name = "GPU";
                 break;
             case DNN_TARGET_MYRIAD:
-                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eMYRIAD);
+                device_name = "MYRIAD";
                 break;
             case DNN_TARGET_FPGA:
-                enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
+                device_name = "FPGA";
                 break;
             default:
                 CV_Error(Error::StsNotImplemented, "Unknown target");
         };
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+        auto dispatcher = InferenceEngine::PluginDispatcher({""});
+        enginePtr = dispatcher.getPluginByDevice(device_name);
+#endif
         if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA)
         {
             std::string suffixes[] = {"_avx2", "_sse4", ""};

@@ -202,16 +209,23 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
             try
             {
                 IExtensionPtr extension = make_so_pointer<IExtension>(libName);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+                ie.AddExtension(extension, device_name);
+#else
                 enginePtr->AddExtension(extension, 0);
+#endif
                 break;
             }
             catch(...) {}
         }
         // Some of networks can work without a library of extra layers.
     }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GT(2019010000)
+    netExec = ie.LoadNetwork(net, device_name);
+#else
     plugin = InferencePlugin(enginePtr);
     netExec = plugin.LoadNetwork(net, {});
+#endif
     infRequest = netExec.CreateInferRequest();
 }
 catch (const std::exception& ex)

@@ -224,7 +238,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
     BlobMap inputBlobs;
     for (auto& it : net.getInputsInfo())
     {
-        genData(it.second->getDims(), inputsMap[it.first], inputBlobs[it.first]);
+        genData(it.second->getTensorDesc().getDims(), inputsMap[it.first], inputBlobs[it.first]);
     }
     infRequest.SetInput(inputBlobs);

@@ -233,7 +247,7 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
     BlobMap outputBlobs;
     for (auto& it : net.getOutputsInfo())
     {
-        genData(it.second->dims, outputsMap[it.first], outputBlobs[it.first]);
+        genData(it.second->getTensorDesc().getDims(), outputsMap[it.first], outputBlobs[it.first]);
     }
     infRequest.SetOutput(outputBlobs);

modules/dnn/test/test_misc.cpp

@@ -469,6 +469,42 @@ INSTANTIATE_TEST_CASE_P(/**/, Async, Combine(
   Values(CV_32F, CV_8U),
   testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
 ));
+
+typedef testing::TestWithParam<Target> Test_Model_Optimizer;
+TEST_P(Test_Model_Optimizer, forward_two_nets)
+{
+    const int target = GetParam();
+
+    const std::string suffix = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+    const std::string& model = findDataFile("dnn/layers/layer_convolution" + suffix + ".bin");
+    const std::string& proto = findDataFile("dnn/layers/layer_convolution" + suffix + ".xml");
+
+    Net net0 = readNet(model, proto);
+    net0.setPreferableTarget(target);
+
+    Net net1 = readNet(model, proto);
+    net1.setPreferableTarget(target);
+
+    // Generate inputs.
+    int blobSize[] = {2, 6, 75, 113};
+    Mat input(4, &blobSize[0], CV_32F);
+    randu(input, 0, 255);
+
+    net0.setInput(input);
+    Mat ref0 = net0.forward().clone();
+
+    net1.setInput(input);
+    Mat ref1 = net1.forward();
+
+    net0.setInput(input);
+    Mat ref2 = net0.forward();
+
+    normAssert(ref0, ref2, 0, 0);
+}
+INSTANTIATE_TEST_CASE_P(/**/, Test_Model_Optimizer,
+    testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE))
+);
+
 #endif  // HAVE_INF_ENGINE

 }} // namespace

modules/dnn/test/test_tf_importer.cpp

@@ -357,11 +357,9 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 #if defined(INF_ENGINE_RELEASE)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
     {
-#if INF_ENGINE_VER_MAJOR_EQ(2019010000)
+#if INF_ENGINE_VER_MAJOR_GE(2019020000)
         if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#else
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif
     }
 #endif

@@ -395,16 +393,10 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
 TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
 {
     applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
-#if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-    {
-#if INF_ENGINE_VER_MAJOR_LE(2019010000)
-        if (getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
-            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
-#else
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
-#endif
-    }
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+        getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
 #endif
     checkBackend();

@@ -456,12 +448,13 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
     float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
 #if defined(INF_ENGINE_RELEASE)
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD
-        && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
-    )
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD &&
+        getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+    {
         scoreDiff = 0.061;
         iouDiff = 0.12;
         detectionConfThresh = 0.36;
+    }
 #endif
     normAssertDetections(ref, out, "", detectionConfThresh, scoreDiff, iouDiff);
     expectNoFallbacksFromIE(net);

modules/dnn/test/test_torch_importer.cpp

@@ -262,7 +262,7 @@ class Test_Torch_nets : public DNNTestLayer {};
 TEST_P(Test_Torch_nets, OpenFace_accuracy)
 {
-#if defined(INF_ENGINE_RELEASE)
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
 #endif

@@ -287,8 +287,8 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
     // Reference output values are in range [-0.17212, 0.263492]
     // on Myriad problem layer: l4_Pooling - does not use pads_begin
-    float l1 = (target == DNN_TARGET_OPENCL_FP16) ? 4e-4 : 1e-5;
-    float lInf = (target == DNN_TARGET_OPENCL_FP16) ? 1.5e-3 : 1e-3;
+    float l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 2e-3 : 1e-5;
+    float lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : 1e-3;
     Mat outRef = readTorchBlob(_tf("net_openface_output.dat"), true);
     normAssert(out, outRef, "", l1, lInf);
 }

modules/js/src/embindgen.py

@@ -98,7 +98,7 @@ core = {'': ['absdiff', 'add', 'addWeighted', 'bitwise_and', 'bitwise_not', 'bit
     'compare', 'convertScaleAbs', 'copyMakeBorder', 'countNonZero', 'determinant', 'dft', 'divide', 'eigen', \
     'exp', 'flip', 'getOptimalDFTSize','gemm', 'hconcat', 'inRange', 'invert', 'kmeans', 'log', 'magnitude', \
     'max', 'mean', 'meanStdDev', 'merge', 'min', 'minMaxLoc', 'mixChannels', 'multiply', 'norm', 'normalize', \
-    'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'setIdentity', 'setRNGSeed', \
+    'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed', \
     'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat'],
 'Algorithm': []}

modules/js/test/test_imgproc.js

@@ -941,4 +941,22 @@ QUnit.test('test_filter', function(assert) {
     inv3.delete();
     inv4.delete();
   }
+  // Rotate
+  {
+    let dst = new cv.Mat();
+    let src = cv.matFromArray(3, 2, cv.CV_8U, [1, 2, 3, 4, 5, 6]);
+
+    cv.rotate(src, dst, cv.ROTATE_90_CLOCKWISE);
+
+    size = dst.size();
+    assert.equal(size.height, 2, "ROTATE_HEIGHT");
+    assert.equal(size.width, 3, "ROTATE_WIGTH");
+
+    let expected = new Uint8Array([5, 3, 1, 6, 4, 2]);
+    assert.deepEqual(dst.data, expected);
+
+    dst.delete();
+    src.delete();
+  }
 });

modules/objdetect/src/hog.cpp

@@ -43,6 +43,7 @@
 #include "precomp.hpp"
 #include "cascadedetect.hpp"
 #include "opencv2/core/core_c.h"
+#include "opencv2/core/hal/intrin.hpp"
 #include "opencl_kernels_objdetect.hpp"
 #include <cstdio>

@@ -223,17 +224,6 @@ void HOGDescriptor::copyTo(HOGDescriptor& c) const
     c.signedGradient = signedGradient;
 }
-#if CV_NEON
-// replace of _mm_set_ps
-inline float32x4_t vsetq_f32(float f0, float f1, float f2, float f3)
-{
-    float32x4_t a = vdupq_n_f32(f0);
-    a = vsetq_lane_f32(f1, a, 1);
-    a = vsetq_lane_f32(f2, a, 2);
-    a = vsetq_lane_f32(f3, a, 3);
-    return a;
-}
-#endif
 void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, InputOutputArray _qangle,
                                     Size paddingTL, Size paddingBR) const
 {

@@ -259,38 +249,22 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
     Mat_<float> _lut(1, 256);
     const float* const lut = &_lut(0,0);
-#if CV_SSE2
-    const int indices[] = { 0, 1, 2, 3 };
-    __m128i idx = _mm_loadu_si128((const __m128i*)indices);
-    __m128i ifour = _mm_set1_epi32(4);
-    float* const _data = &_lut(0, 0);
-    if( gammaCorrection )
-        for( i = 0; i < 256; i += 4 )
-        {
-            _mm_storeu_ps(_data + i, _mm_sqrt_ps(_mm_cvtepi32_ps(idx)));
-            idx = _mm_add_epi32(idx, ifour);
-        }
-    else
-        for( i = 0; i < 256; i += 4 )
-        {
-            _mm_storeu_ps(_data + i, _mm_cvtepi32_ps(idx));
-            idx = _mm_add_epi32(idx, ifour);
-        }
-#elif CV_NEON
-    const int indices[] = { 0, 1, 2, 3 };
-    uint32x4_t idx = *(uint32x4_t*)indices;
-    uint32x4_t ifour = vdupq_n_u32(4);
-    float* const _data = &_lut(0, 0);
-    if( gammaCorrection )
-        for( i = 0; i < 256; i++ )
-            _lut(0,i) = std::sqrt((float)i);
-    else
-        for( i = 0; i < 256; i += 4 )
-        {
-            vst1q_f32(_data + i, vcvtq_f32_u32(idx));
-            idx = vaddq_u32 (idx, ifour);
-        }
+#if CV_SIMD128
+    v_float32x4 idx(0.0f, 1.0f, 2.0f, 3.0f);
+    v_float32x4 ifour = v_setall_f32(4.0);
+    float* const _data = &_lut(0, 0);
+    if ( gammaCorrection )
+        for ( i = 0; i < 256; i += 4)
+        {
+            v_store(_data + i, v_sqrt(idx));
+            idx += ifour;
+        }
+    else
+        for ( i = 0; i < 256; i += 4)
+        {
+            v_store(_data + i, idx);
+            idx += ifour;
+        }
 #else
     if( gammaCorrection )

@@ -327,17 +301,13 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
     {
         int end = gradsize.width + 2;
         xmap -= 1, x = 0;
-#if CV_SSE2
+#if CV_SIMD128
         for ( ; x <= end - 4; x += 4)
         {
-            __m128i mul_res = _mm_loadu_si128((const __m128i*)(xmap + x));
-            mul_res = _mm_add_epi32(_mm_add_epi32(mul_res, mul_res), mul_res); // multiply by 3
-            _mm_storeu_si128((__m128i*)(xmap + x), mul_res);
+            v_int32x4 mul_res = v_load(xmap + x);
+            mul_res += mul_res + mul_res;
+            v_store(xmap + x, mul_res);
         }
-#elif CV_NEON
-        int32x4_t ithree = vdupq_n_s32(3);
-        for ( ; x <= end - 4; x += 4)
-            vst1q_s32(xmap + x, vmulq_s32(ithree, vld1q_s32(xmap + x)));
 #endif
         for ( ; x < end; ++x)
             xmap[x] *= 3;

@@ -368,46 +338,7 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
         else
         {
             x = 0;
-#if CV_SSE2
-            for( ; x <= width - 4; x += 4 )
-            {
-                int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];
-                typedef const uchar* const T;
-                T p02 = imgPtr + xmap[x+1], p00 = imgPtr + xmap[x-1];
-                T p12 = imgPtr + xmap[x+2], p10 = imgPtr + xmap[x];
-                T p22 = imgPtr + xmap[x+3], p20 = p02;
-                T p32 = imgPtr + xmap[x+4], p30 = p12;
-                __m128 _dx0 = _mm_sub_ps(_mm_set_ps(lut[p32[0]], lut[p22[0]], lut[p12[0]], lut[p02[0]]),
-                                         _mm_set_ps(lut[p30[0]], lut[p20[0]], lut[p10[0]], lut[p00[0]]));
-                __m128 _dx1 = _mm_sub_ps(_mm_set_ps(lut[p32[1]], lut[p22[1]], lut[p12[1]], lut[p02[1]]),
-                                         _mm_set_ps(lut[p30[1]], lut[p20[1]], lut[p10[1]], lut[p00[1]]));
-                __m128 _dx2 = _mm_sub_ps(_mm_set_ps(lut[p32[2]], lut[p22[2]], lut[p12[2]], lut[p02[2]]),
-                                         _mm_set_ps(lut[p30[2]], lut[p20[2]], lut[p10[2]], lut[p00[2]]));
-                __m128 _dy0 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3]], lut[nextPtr[x2]], lut[nextPtr[x1]], lut[nextPtr[x0]]),
-                                         _mm_set_ps(lut[prevPtr[x3]], lut[prevPtr[x2]], lut[prevPtr[x1]], lut[prevPtr[x0]]));
-                __m128 _dy1 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+1]], lut[nextPtr[x2+1]], lut[nextPtr[x1+1]], lut[nextPtr[x0+1]]),
-                                         _mm_set_ps(lut[prevPtr[x3+1]], lut[prevPtr[x2+1]], lut[prevPtr[x1+1]], lut[prevPtr[x0+1]]));
-                __m128 _dy2 = _mm_sub_ps(_mm_set_ps(lut[nextPtr[x3+2]], lut[nextPtr[x2+2]], lut[nextPtr[x1+2]], lut[nextPtr[x0+2]]),
-                                         _mm_set_ps(lut[prevPtr[x3+2]], lut[prevPtr[x2+2]], lut[prevPtr[x1+2]], lut[prevPtr[x0+2]]));
-                __m128 _mag0 = _mm_add_ps(_mm_mul_ps(_dx0, _dx0), _mm_mul_ps(_dy0, _dy0));
-                __m128 _mag1 = _mm_add_ps(_mm_mul_ps(_dx1, _dx1), _mm_mul_ps(_dy1, _dy1));
-                __m128 _mag2 = _mm_add_ps(_mm_mul_ps(_dx2, _dx2), _mm_mul_ps(_dy2, _dy2));
-                __m128 mask = _mm_cmpgt_ps(_mag2, _mag1);
-                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx1));
-                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy1));
-                mask = _mm_cmpgt_ps(_mm_max_ps(_mag2, _mag1), _mag0);
-                _dx2 = _mm_or_ps(_mm_and_ps(_dx2, mask), _mm_andnot_ps(mask, _dx0));
-                _dy2 = _mm_or_ps(_mm_and_ps(_dy2, mask), _mm_andnot_ps(mask, _dy0));
-                _mm_storeu_ps(dbuf + x, _dx2);
-                _mm_storeu_ps(dbuf + x + width, _dy2);
-            }
-#elif CV_NEON
+#if CV_SIMD128
             for( ; x <= width - 4; x += 4 )
             {
                 int x0 = xmap[x], x1 = xmap[x+1], x2 = xmap[x+2], x3 = xmap[x+3];

@@ -417,34 +348,34 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
                 T p22 = imgPtr + xmap[x+3], p20 = p02;
                 T p32 = imgPtr + xmap[x+4], p30 = p12;
-                float32x4_t _dx0 = vsubq_f32(vsetq_f32(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]),
-                                             vsetq_f32(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]));
-                float32x4_t _dx1 = vsubq_f32(vsetq_f32(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]),
-                                             vsetq_f32(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]));
-                float32x4_t _dx2 = vsubq_f32(vsetq_f32(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]),
-                                             vsetq_f32(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]));
-                float32x4_t _dy0 = vsubq_f32(vsetq_f32(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]),
-                                             vsetq_f32(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]));
-                float32x4_t _dy1 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]),
-                                             vsetq_f32(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]));
-                float32x4_t _dy2 = vsubq_f32(vsetq_f32(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]),
-                                             vsetq_f32(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]));
-                float32x4_t _mag0 = vaddq_f32(vmulq_f32(_dx0, _dx0), vmulq_f32(_dy0, _dy0));
-                float32x4_t _mag1 = vaddq_f32(vmulq_f32(_dx1, _dx1), vmulq_f32(_dy1, _dy1));
-                float32x4_t _mag2 = vaddq_f32(vmulq_f32(_dx2, _dx2), vmulq_f32(_dy2, _dy2));
-                uint32x4_t mask = vcgtq_f32(_mag2, _mag1);
-                _dx2 = vbslq_f32(mask, _dx2, _dx1);
-                _dy2 = vbslq_f32(mask, _dy2, _dy1);
-                mask = vcgtq_f32(vmaxq_f32(_mag2, _mag1), _mag0);
-                _dx2 = vbslq_f32(mask, _dx2, _dx0);
-                _dy2 = vbslq_f32(mask, _dy2, _dy0);
+                v_float32x4 _dx0 = v_float32x4(lut[p02[0]], lut[p12[0]], lut[p22[0]], lut[p32[0]]) -
+                                   v_float32x4(lut[p00[0]], lut[p10[0]], lut[p20[0]], lut[p30[0]]);
+                v_float32x4 _dx1 = v_float32x4(lut[p02[1]], lut[p12[1]], lut[p22[1]], lut[p32[1]]) -
+                                   v_float32x4(lut[p00[1]], lut[p10[1]], lut[p20[1]], lut[p30[1]]);
+                v_float32x4 _dx2 = v_float32x4(lut[p02[2]], lut[p12[2]], lut[p22[2]], lut[p32[2]]) -
+                                   v_float32x4(lut[p00[2]], lut[p10[2]], lut[p20[2]], lut[p30[2]]);
+                v_float32x4 _dy0 = v_float32x4(lut[nextPtr[x0]], lut[nextPtr[x1]], lut[nextPtr[x2]], lut[nextPtr[x3]]) -
+                                   v_float32x4(lut[prevPtr[x0]], lut[prevPtr[x1]], lut[prevPtr[x2]], lut[prevPtr[x3]]);
+                v_float32x4 _dy1 = v_float32x4(lut[nextPtr[x0+1]], lut[nextPtr[x1+1]], lut[nextPtr[x2+1]], lut[nextPtr[x3+1]]) -
+                                   v_float32x4(lut[prevPtr[x0+1]], lut[prevPtr[x1+1]], lut[prevPtr[x2+1]], lut[prevPtr[x3+1]]);
+                v_float32x4 _dy2 = v_float32x4(lut[nextPtr[x0+2]], lut[nextPtr[x1+2]], lut[nextPtr[x2+2]], lut[nextPtr[x3+2]]) -
+                                   v_float32x4(lut[prevPtr[x0+2]], lut[prevPtr[x1+2]], lut[prevPtr[x2+2]], lut[prevPtr[x3+2]]);
+                v_float32x4 _mag0 = (_dx0 * _dx0) + (_dy0 * _dy0);
+                v_float32x4 _mag1 = (_dx1 * _dx1) + (_dy1 * _dy1);
+                v_float32x4 _mag2 = (_dx2 * _dx2) + (_dy2 * _dy2);
+                v_float32x4 mask = v_reinterpret_as_f32(_mag2 > _mag1);
+                _dx2 = v_select(mask, _dx2, _dx1);
+                _dy2 = v_select(mask, _dy2, _dy1);
+                mask = v_reinterpret_as_f32(v_max(_mag2, _mag1) > _mag0);
+                _dx2 = v_select(mask, _dx2, _dx0);
+                _dy2 = v_select(mask, _dy2, _dy0);
vst1q_f32(dbuf + x, _dx2); v_store(dbuf + x, _dx2);
vst1q_f32(dbuf + x + width, _dy2); v_store(dbuf + x + width, _dy2);
} }
#endif #endif
for( ; x < width; x++ ) for( ; x < width; x++ )
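Aside (not part of the patch): the SSE2 and/andnot masking and the NEON vbslq selects above both collapse into v_select. A hedged sketch of the per-lane choice of the channel with the largest squared gradient magnitude, using an illustrative function name:

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

#if CV_SIMD128
static void pick_max_magnitude(v_float32x4 dx0, v_float32x4 dy0,
                               v_float32x4 dx1, v_float32x4 dy1,
                               v_float32x4 dx2, v_float32x4 dy2,
                               v_float32x4& dx, v_float32x4& dy)
{
    v_float32x4 m0 = dx0 * dx0 + dy0 * dy0;
    v_float32x4 m1 = dx1 * dx1 + dy1 * dy1;
    v_float32x4 m2 = dx2 * dx2 + dy2 * dy2;
    v_float32x4 mask = v_reinterpret_as_f32(m2 > m1);   // lanes where channel 2 beats channel 1
    dx = v_select(mask, dx2, dx1);
    dy = v_select(mask, dy2, dy1);
    mask = v_reinterpret_as_f32(v_max(m2, m1) > m0);    // lanes where the winner beats channel 0
    dx = v_select(mask, dx, dx0);
    dy = v_select(mask, dy, dy0);
}
#endif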
@ -488,44 +419,40 @@ void HOGDescriptor::computeGradient(InputArray _img, InputOutputArray _grad, Inp
// filling the result matrix // filling the result matrix
x = 0; x = 0;
#if CV_SSE2 #if CV_SIMD128
__m128 fhalf = _mm_set1_ps(0.5f), fzero = _mm_setzero_ps(); v_float32x4 fhalf = v_setall_f32(0.5f);
__m128 _angleScale = _mm_set1_ps(angleScale), fone = _mm_set1_ps(1.0f); v_float32x4 _angleScale = v_setall_f32(angleScale), fone = v_setall_f32(1.0f);
__m128i ione = _mm_set1_epi32(1), _nbins = _mm_set1_epi32(nbins), izero = _mm_setzero_si128(); v_int32x4 ione = v_setall_s32(1), _nbins = v_setall_s32(nbins), izero = v_setzero_s32();
for ( ; x <= width - 4; x += 4) for ( ; x <= width - 4; x += 4)
{ {
int x2 = x << 1; int x2 = x << 1;
__m128 _mag = _mm_loadu_ps(dbuf + x + (width << 1)); v_float32x4 _mag = v_load(dbuf + x + (width << 1));
__m128 _angle = _mm_loadu_ps(dbuf + x + width * 3); v_float32x4 _angle = v_load(dbuf + x + width * 3);
_angle = _mm_sub_ps(_mm_mul_ps(_angleScale, _angle), fhalf); _angle = (_angleScale * _angle) - fhalf;
__m128 sign = _mm_and_ps(fone, _mm_cmplt_ps(_angle, fzero)); v_int32x4 _hidx = v_floor(_angle);
__m128i _hidx = _mm_cvttps_epi32(_angle); _angle -= v_cvt_f32(_hidx);
_hidx = _mm_sub_epi32(_hidx, _mm_cvtps_epi32(sign));
_angle = _mm_sub_ps(_angle, _mm_cvtepi32_ps(_hidx)); v_float32x4 ft0 = _mag * (fone - _angle);
v_float32x4 ft1 = _mag * _angle;
__m128 ft0 = _mm_mul_ps(_mag, _mm_sub_ps(fone, _angle));
__m128 ft1 = _mm_mul_ps(_mag, _angle); v_store_interleave(gradPtr + x2, ft0, ft1);
__m128 ft2 = _mm_unpacklo_ps(ft0, ft1);
__m128 ft3 = _mm_unpackhi_ps(ft0, ft1); v_int32x4 mask0 = _hidx >> 31;
v_int32x4 it0 = mask0 & _nbins;
_mm_storeu_ps(gradPtr + x2, ft2); mask0 = (_hidx >= _nbins);
_mm_storeu_ps(gradPtr + x2 + 4, ft3); v_int32x4 it1 = mask0 & _nbins;
_hidx += (it0 - it1);
__m128i mask0 = _mm_sub_epi32(izero, _mm_srli_epi32(_hidx, 31));
__m128i it0 = _mm_and_si128(mask0, _nbins); it0 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
mask0 = _mm_cmplt_epi32(_hidx, _nbins); _hidx += ione;
__m128i it1 = _mm_andnot_si128(mask0, _nbins); _hidx &= (_hidx < _nbins);
_hidx = _mm_add_epi32(_hidx, _mm_sub_epi32(it0, it1)); it1 = v_reinterpret_as_s32(v_pack(v_pack(_hidx, izero), v_reinterpret_as_s16(izero)));
v_uint8x16 it2, it3;
it0 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero); v_zip(v_reinterpret_as_u8(it0), v_reinterpret_as_u8(it1), it2, it3);
_hidx = _mm_add_epi32(ione, _hidx);
_hidx = _mm_and_si128(_hidx, _mm_cmplt_epi32(_hidx, _nbins)); v_store_low(qanglePtr + x2, it2);
it1 = _mm_packus_epi16(_mm_packs_epi32(_hidx, izero), izero);
it0 = _mm_unpacklo_epi8(it0, it1);
_mm_storel_epi64((__m128i*)(qanglePtr + x2), it0);
} }
#endif #endif
for( ; x < width; x++ ) for( ; x < width; x++ )
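Aside (not part of the patch): the vector code above splits each gradient vote between the two nearest orientation bins. A scalar sketch of the same arithmetic, with illustrative names (angleScale maps the angle range onto nbins bins):

#include <cmath>

static inline void vote(float mag, float angle, float angleScale, int nbins,
                        float grad2[2], unsigned char qangle2[2])
{
    float a = angle * angleScale - 0.5f;   // continuous bin coordinate
    int hidx = (int)std::floor(a);         // lower bin
    a -= hidx;                             // fractional part = weight of the upper bin
    if (hidx < 0) hidx += nbins;           // wrap into [0, nbins)
    else if (hidx >= nbins) hidx -= nbins;
    grad2[0] = mag * (1.f - a);            // vote for the lower bin
    grad2[1] = mag * a;                    // vote for the upper bin
    qangle2[0] = (unsigned char)hidx;
    qangle2[1] = (unsigned char)(hidx + 1 < nbins ? hidx + 1 : 0);
}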
@ -665,31 +592,17 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
float bh = blockSize.height * 0.5f, bw = blockSize.width * 0.5f; float bh = blockSize.height * 0.5f, bw = blockSize.width * 0.5f;
i = 0; i = 0;
#if CV_SSE2 #if CV_SIMD128
const int a[] = { 0, 1, 2, 3 }; v_float32x4 idx(0.0f, 1.0f, 2.0f, 3.0f);
__m128i idx = _mm_loadu_si128((__m128i*)a); v_float32x4 _bw = v_setall_f32(bw), _bh = v_setall_f32(bh);
__m128 _bw = _mm_set1_ps(bw), _bh = _mm_set1_ps(bh); v_float32x4 ifour = v_setall_f32(4.0);
__m128i ifour = _mm_set1_epi32(4);
for (; i <= blockSize.height - 4; i += 4)
{
__m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bh);
t = _mm_mul_ps(t, t);
idx = _mm_add_epi32(idx, ifour);
_mm_storeu_ps(_di + i, t);
}
#elif CV_NEON
const int a[] = { 0, 1, 2, 3 };
int32x4_t idx = vld1q_s32(a);
float32x4_t _bw = vdupq_n_f32(bw), _bh = vdupq_n_f32(bh);
int32x4_t ifour = vdupq_n_s32(4);
for (; i <= blockSize.height - 4; i += 4) for (; i <= blockSize.height - 4; i += 4)
{ {
float32x4_t t = vsubq_f32(vcvtq_f32_s32(idx), _bh); v_float32x4 t = idx - _bh;
t = vmulq_f32(t, t); t *= t;
idx = vaddq_s32(idx, ifour); idx += ifour;
vst1q_f32(_di + i, t); v_store(_di + i, t);
} }
#endif #endif
for ( ; i < blockSize.height; ++i) for ( ; i < blockSize.height; ++i)
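Aside (not part of the patch): the loop above fills a table of squared distances from the block centre, advancing a float index vector by 4 each iteration. A self-contained sketch of that pattern (illustrative helper; assumes dst holds n floats):

#include "opencv2/core/hal/intrin.hpp"

static void squared_dist_from_centre(float* dst, int n, float centre)
{
    int i = 0;
#if CV_SIMD128
    cv::v_float32x4 idx(0.f, 1.f, 2.f, 3.f);
    cv::v_float32x4 c = cv::v_setall_f32(centre), four = cv::v_setall_f32(4.f);
    for (; i <= n - 4; i += 4)
    {
        cv::v_float32x4 t = idx - c;
        cv::v_store(dst + i, t * t);
        idx += four;                       // advance the index vector
    }
#endif
    for (; i < n; ++i)
    {
        float t = (float)i - centre;
        dst[i] = t * t;
    }
}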
@ -699,23 +612,15 @@ void HOGCache::init(const HOGDescriptor* _descriptor,
} }
j = 0; j = 0;
#if CV_SSE2 #if CV_SIMD128
idx = _mm_loadu_si128((__m128i*)a); idx = v_float32x4(0.0f, 1.0f, 2.0f, 3.0f);
for (; j <= blockSize.width - 4; j += 4)
{ for (; j <= blockSize.width - 4; j += 4)
__m128 t = _mm_sub_ps(_mm_cvtepi32_ps(idx), _bw);
t = _mm_mul_ps(t, t);
idx = _mm_add_epi32(idx, ifour);
_mm_storeu_ps(_dj + j, t);
}
#elif CV_NEON
idx = vld1q_s32(a);
for (; j <= blockSize.width - 4; j += 4)
{ {
float32x4_t t = vsubq_f32(vcvtq_f32_s32(idx), _bw); v_float32x4 t = idx - _bw;
t = vmulq_f32(t, t); t *= t;
idx = vaddq_s32(idx, ifour); idx += ifour;
vst1q_f32(_dj + j, t); v_store(_dj + j, t);
} }
#endif #endif
for ( ; j < blockSize.width; ++j) for ( ; j < blockSize.width; ++j)
@ -913,7 +818,7 @@ const float* HOGCache::getBlock(Point pt, float* buf)
hist[h0] = t0; hist[h1] = t1; hist[h0] = t0; hist[h1] = t1;
} }
#if CV_SSE2 #if CV_SIMD128
float hist0[4], hist1[4]; float hist0[4], hist1[4];
for( ; k < C2; k++ ) for( ; k < C2; k++ )
{ {
@ -922,12 +827,12 @@ const float* HOGCache::getBlock(Point pt, float* buf)
const uchar* const h = qanglePtr + pk.qangleOfs; const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1]; int h0 = h[0], h1 = h[1];
__m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]); v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
__m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights)); v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
__m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w); v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
_mm_storeu_ps(hist0, _t0); v_store(hist0, _t0);
_mm_storeu_ps(hist1, _t1); v_store(hist1, _t1);
float* hist = blockHist + pk.histOfs[0]; float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + hist0[0]; float t0 = hist[h0] + hist0[0];
@ -939,31 +844,6 @@ const float* HOGCache::getBlock(Point pt, float* buf)
t1 = hist[h1] + hist1[1]; t1 = hist[h1] + hist1[1];
hist[h0] = t0; hist[h1] = t1; hist[h0] = t0; hist[h1] = t1;
} }
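Aside (not part of the patch): in the C2 and C4 loops the weight computation is vectorised, but the accumulation into the block histogram stays scalar because each lane targets a different histogram offset. A hedged sketch of that split (illustrative helper; assumes histWeights has at least four entries, as in PixData):

#include "opencv2/core/hal/intrin.hpp"

static void add_weighted_votes(float* blockHist, const int* histOfs, int ncells,
                               int bin, float a, float gradWeight, const float* histWeights)
{
    float w[4] = {0.f, 0.f, 0.f, 0.f};
#if CV_SIMD128
    cv::v_float32x4 v = cv::v_setall_f32(a * gradWeight) * cv::v_load(histWeights);
    cv::v_store(w, v);                      // one vector multiply covers up to four cells
#else
    for (int c = 0; c < ncells; ++c)
        w[c] = a * gradWeight * histWeights[c];
#endif
    for (int c = 0; c < ncells; ++c)        // scalar scatter: offsets differ per cell
        blockHist[histOfs[c] + bin] += w[c];
}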
#elif CV_NEON
float hist0[4], hist1[4];
for( ; k < C2; k++ )
{
const PixData& pk = _pixData[k];
const float* const a = gradPtr + pk.gradOfs;
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];
float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]);
float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights));
float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0], (blockHist + pk.histOfs[1])[h0], 0, 0);
float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1], (blockHist + pk.histOfs[1])[h1], 0, 0);
float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
vst1q_f32(hist0, _t0);
vst1q_f32(hist1, _t1);
(blockHist + pk.histOfs[0])[h0] = hist0[0];
(blockHist + pk.histOfs[1])[h0] = hist0[1];
(blockHist + pk.histOfs[0])[h1] = hist1[0];
(blockHist + pk.histOfs[1])[h1] = hist1[1];
}
#else #else
for( ; k < C2; k++ ) for( ; k < C2; k++ )
{ {
@ -987,7 +867,7 @@ const float* HOGCache::getBlock(Point pt, float* buf)
} }
#endif #endif
#if CV_SSE2 #if CV_SIMD128
for( ; k < C4; k++ ) for( ; k < C4; k++ )
{ {
const PixData& pk = _pixData[k]; const PixData& pk = _pixData[k];
@ -995,12 +875,12 @@ const float* HOGCache::getBlock(Point pt, float* buf)
const uchar* const h = qanglePtr + pk.qangleOfs; const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1]; int h0 = h[0], h1 = h[1];
__m128 _a0 = _mm_set1_ps(a[0]), _a1 = _mm_set1_ps(a[1]); v_float32x4 _a0 = v_setall_f32(a[0]), _a1 = v_setall_f32(a[1]);
__m128 _w = _mm_mul_ps(_mm_set1_ps(pk.gradWeight), _mm_loadu_ps(pk.histWeights)); v_float32x4 w = v_setall_f32(pk.gradWeight) * v_load(pk.histWeights);
__m128 _t0 = _mm_mul_ps(_a0, _w), _t1 = _mm_mul_ps(_a1, _w); v_float32x4 _t0 = _a0 * w, _t1 = _a1 * w;
_mm_storeu_ps(hist0, _t0); v_store(hist0, _t0);
_mm_storeu_ps(hist1, _t1); v_store(hist1, _t1);
float* hist = blockHist + pk.histOfs[0]; float* hist = blockHist + pk.histOfs[0];
float t0 = hist[h0] + hist0[0]; float t0 = hist[h0] + hist0[0];
@ -1021,62 +901,6 @@ const float* HOGCache::getBlock(Point pt, float* buf)
t0 = hist[h0] + hist0[3]; t0 = hist[h0] + hist0[3];
t1 = hist[h1] + hist1[3]; t1 = hist[h1] + hist1[3];
hist[h0] = t0; hist[h1] = t1; hist[h0] = t0; hist[h1] = t1;
// __m128 _hist0 = _mm_set_ps((blockHist + pk.histOfs[3])[h0], (blockHist + pk.histOfs[2])[h0],
// (blockHist + pk.histOfs[1])[h0], (blockHist + pk.histOfs[0])[h0]);
// __m128 _hist1 = _mm_set_ps((blockHist + pk.histOfs[3])[h1], (blockHist + pk.histOfs[2])[h1],
// (blockHist + pk.histOfs[1])[h1], (blockHist + pk.histOfs[0])[h1]);
//
// _hist0 = _mm_add_ps(_t0, _hist0);
// _hist1 = _mm_add_ps(_t1, _hist1);
//
// _mm_storeu_ps(hist0, _hist0);
// _mm_storeu_ps(hist1, _hist1);
//
// (pk.histOfs[0] + blockHist)[h0] = hist0[0];
// (pk.histOfs[1] + blockHist)[h0] = hist0[1];
// (pk.histOfs[2] + blockHist)[h0] = hist0[2];
// (pk.histOfs[3] + blockHist)[h0] = hist0[3];
//
// (pk.histOfs[0] + blockHist)[h1] = hist1[0];
// (pk.histOfs[1] + blockHist)[h1] = hist1[1];
// (pk.histOfs[2] + blockHist)[h1] = hist1[2];
// (pk.histOfs[3] + blockHist)[h1] = hist1[3];
}
#elif CV_NEON
for( ; k < C4; k++ )
{
const PixData& pk = _pixData[k];
const float* const a = gradPtr + pk.gradOfs;
const uchar* const h = qanglePtr + pk.qangleOfs;
int h0 = h[0], h1 = h[1];
float32x4_t _a0 = vdupq_n_f32(a[0]), _a1 = vdupq_n_f32(a[1]);
float32x4_t _w = vmulq_f32(vdupq_n_f32(pk.gradWeight), vld1q_f32(pk.histWeights));
float32x4_t _h0 = vsetq_f32((blockHist + pk.histOfs[0])[h0],
(blockHist + pk.histOfs[1])[h0],
(blockHist + pk.histOfs[2])[h0],
(blockHist + pk.histOfs[3])[h0]);
float32x4_t _h1 = vsetq_f32((blockHist + pk.histOfs[0])[h1],
(blockHist + pk.histOfs[1])[h1],
(blockHist + pk.histOfs[2])[h1],
(blockHist + pk.histOfs[3])[h1]);
float32x4_t _t0 = vmlaq_f32(_h0, _a0, _w), _t1 = vmlaq_f32(_h1, _a1, _w);
vst1q_f32(hist0, _t0);
vst1q_f32(hist1, _t1);
(blockHist + pk.histOfs[0])[h0] = hist0[0];
(blockHist + pk.histOfs[1])[h0] = hist0[1];
(blockHist + pk.histOfs[2])[h0] = hist0[2];
(blockHist + pk.histOfs[3])[h0] = hist0[3];
(blockHist + pk.histOfs[0])[h1] = hist1[0];
(blockHist + pk.histOfs[1])[h1] = hist1[1];
(blockHist + pk.histOfs[2])[h1] = hist1[2];
(blockHist + pk.histOfs[3])[h1] = hist1[3];
} }
#else #else
for( ; k < C4; k++ ) for( ; k < C4; k++ )
@ -1123,26 +947,16 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
float* hist = &_hist[0], sum = 0.0f, partSum[4]; float* hist = &_hist[0], sum = 0.0f, partSum[4];
size_t i = 0, sz = blockHistogramSize; size_t i = 0, sz = blockHistogramSize;
#if CV_SSE2 #if CV_SIMD128
__m128 p0 = _mm_loadu_ps(hist); v_float32x4 p0 = v_load(hist);
__m128 s = _mm_mul_ps(p0, p0); v_float32x4 s = p0 * p0;
for (i = 4; i <= sz - 4; i += 4) for (i = 4; i <= sz - 4; i += 4)
{ {
p0 = _mm_loadu_ps(hist + i); p0 = v_load(hist + i);
s = _mm_add_ps(s, _mm_mul_ps(p0, p0)); s += p0 * p0;
} }
_mm_storeu_ps(partSum, s); v_store(partSum, s);
#elif CV_NEON
float32x4_t p0 = vld1q_f32(hist);
float32x4_t s = vmulq_f32(p0, p0);
for (i = 4; i <= sz - 4; i += 4)
{
p0 = vld1q_f32(hist + i);
s = vaddq_f32(s, vmulq_f32(p0, p0));
}
vst1q_f32(partSum, s);
#else #else
partSum[0] = 0.0f; partSum[0] = 0.0f;
partSum[1] = 0.0f; partSum[1] = 0.0f;
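Aside (not part of the patch): the partSum[4] buffer plus the scalar additions that follow could also be expressed with v_reduce_sum. A hedged alternative sketch of the sum-of-squares reduction:

#include "opencv2/core/hal/intrin.hpp"

static float sum_of_squares(const float* hist, size_t sz)
{
    size_t i = 0;
    float sum = 0.f;
#if CV_SIMD128
    cv::v_float32x4 s = cv::v_setzero_f32();
    for (; i + 4 <= sz; i += 4)
    {
        cv::v_float32x4 p = cv::v_load(hist + i);
        s += p * p;
    }
    sum = cv::v_reduce_sum(s);              // horizontal add of the four lanes
#endif
    for (; i < sz; ++i)                     // scalar tail
        sum += hist[i] * hist[i];
    return sum;
}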
@ -1165,44 +979,25 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold; float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)descriptor->L2HysThreshold;
i = 0, sum = 0.0f; i = 0, sum = 0.0f;
#if CV_SSE2 #if CV_SIMD128
__m128 _scale = _mm_set1_ps(scale); v_float32x4 _scale = v_setall_f32(scale);
static __m128 _threshold = _mm_set1_ps(thresh); static v_float32x4 _threshold = v_setall_f32(thresh);
__m128 p = _mm_mul_ps(_scale, _mm_loadu_ps(hist)); v_float32x4 p = _scale * v_load(hist);
p = _mm_min_ps(p, _threshold); p = v_min(p, _threshold);
s = _mm_mul_ps(p, p); s = p * p;
_mm_storeu_ps(hist, p); v_store(hist, p);
for(i = 4 ; i <= sz - 4; i += 4) for(i = 4 ; i <= sz - 4; i += 4)
{ {
p = _mm_loadu_ps(hist + i); p = v_load(hist + i);
p = _mm_mul_ps(p, _scale); p *= _scale;
p = _mm_min_ps(p, _threshold); p = v_min(p, _threshold);
s = _mm_add_ps(s, _mm_mul_ps(p, p)); s += p * p;
_mm_storeu_ps(hist + i, p); v_store(hist + i, p);
} }
_mm_storeu_ps(partSum, s); v_store(partSum, s);
#elif CV_NEON
float32x4_t _scale = vdupq_n_f32(scale);
static float32x4_t _threshold = vdupq_n_f32(thresh);
float32x4_t p = vmulq_f32(_scale, vld1q_f32(hist));
p = vminq_f32(p, _threshold);
s = vmulq_f32(p, p);
vst1q_f32(hist, p);
for(i = 4 ; i <= sz - 4; i += 4)
{
p = vld1q_f32(hist + i);
p = vmulq_f32(p, _scale);
p = vminq_f32(p, _threshold);
s = vaddq_f32(s, vmulq_f32(p, p));
vst1q_f32(hist + i, p);
}
vst1q_f32(partSum, s);
#else #else
partSum[0] = 0.0f; partSum[0] = 0.0f;
partSum[1] = 0.0f; partSum[1] = 0.0f;
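Aside (not part of the patch): the two vector passes above implement L2-Hys normalisation — L2-normalise, clip at L2HysThreshold, then renormalise. A scalar reference sketch of the same steps:

#include <cmath>
#include <algorithm>

static void l2hys_normalize(float* hist, size_t sz, float thresh)
{
    float sum = 0.f;
    for (size_t i = 0; i < sz; ++i) sum += hist[i] * hist[i];
    float scale = 1.f / (std::sqrt(sum) + sz * 0.1f);

    sum = 0.f;
    for (size_t i = 0; i < sz; ++i)
    {
        hist[i] = std::min(hist[i] * scale, thresh);   // scale, then clip large components
        sum += hist[i] * hist[i];
    }
    scale = 1.f / (std::sqrt(sum) + 1e-3f);            // renormalise
    for (size_t i = 0; i < sz; ++i)
        hist[i] *= scale;
}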
@ -1230,19 +1025,12 @@ void HOGCache::normalizeBlockHistogram(float* _hist) const
} }
scale = 1.f/(std::sqrt(sum)+1e-3f), i = 0; scale = 1.f/(std::sqrt(sum)+1e-3f), i = 0;
#if CV_SSE2 #if CV_SIMD128
__m128 _scale2 = _mm_set1_ps(scale); v_float32x4 _scale2 = v_setall_f32(scale);
for ( ; i <= sz - 4; i += 4)
{
__m128 t = _mm_mul_ps(_scale2, _mm_loadu_ps(hist + i));
_mm_storeu_ps(hist + i, t);
}
#elif CV_NEON
float32x4_t _scale2 = vdupq_n_f32(scale);
for ( ; i <= sz - 4; i += 4) for ( ; i <= sz - 4; i += 4)
{ {
float32x4_t t = vmulq_f32(_scale2, vld1q_f32(hist + i)); v_float32x4 t = _scale2 * v_load(hist + i);
vst1q_f32(hist + i, t); v_store(hist + i, t);
} }
#endif #endif
for ( ; i < sz; ++i) for ( ; i < sz; ++i)
@ -1690,7 +1478,7 @@ void HOGDescriptor::detect(InputArray _img,
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0; double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
std::vector<float> blockHist(blockHistogramSize); std::vector<float> blockHist(blockHistogramSize);
#if CV_SSE2 || CV_NEON #if CV_SIMD128
float partSum[4]; float partSum[4];
#endif #endif
@ -1719,37 +1507,20 @@ void HOGDescriptor::detect(InputArray _img,
Point pt = pt0 + bj.imgOffset; Point pt = pt0 + bj.imgOffset;
const float* vec = cache.getBlock(pt, &blockHist[0]); const float* vec = cache.getBlock(pt, &blockHist[0]);
#if CV_SSE2 #if CV_SIMD128
__m128 _vec = _mm_loadu_ps(vec); v_float32x4 _vec = v_load(vec);
__m128 _svmVec = _mm_loadu_ps(svmVec); v_float32x4 _svmVec = v_load(svmVec);
__m128 sum = _mm_mul_ps(_svmVec, _vec); v_float32x4 sum = _svmVec * _vec;
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = _mm_loadu_ps(vec + k);
_svmVec = _mm_loadu_ps(svmVec + k);
sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec));
}
_mm_storeu_ps(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
#elif CV_NEON
float32x4_t _vec = vld1q_f32(vec);
float32x4_t _svmVec = vld1q_f32(svmVec);
float32x4_t sum = vmulq_f32(_svmVec, _vec);
for( k = 4; k <= blockHistogramSize - 4; k += 4 ) for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{ {
_vec = vld1q_f32(vec + k); _vec = v_load(vec + k);
_svmVec = vld1q_f32(svmVec + k); _svmVec = v_load(svmVec + k);
sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec)); sum += _vec * _svmVec;
} }
vst1q_f32(partSum, sum); v_store(partSum, sum);
double t0 = partSum[0] + partSum[1]; double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3]; double t1 = partSum[2] + partSum[3];
s += t0 + t1; s += t0 + t1;
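Aside (not part of the patch): the detection loop accumulates the dot product of each block histogram with the matching slice of the SVM detector. The same pattern as a self-contained sketch (illustrative helper name):

#include "opencv2/core/hal/intrin.hpp"

static double block_dot(const float* vec, const float* svmVec, int n)
{
    int k = 0;
    double s = 0;
#if CV_SIMD128
    cv::v_float32x4 sum = cv::v_setzero_f32();
    for (; k + 4 <= n; k += 4)
        sum += cv::v_load(vec + k) * cv::v_load(svmVec + k);
    s += cv::v_reduce_sum(sum);
#endif
    for (; k < n; ++k)                      // scalar tail
        s += (double)vec[k] * svmVec[k];
    return s;
}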
@ -3530,7 +3301,7 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0; double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
std::vector<float> blockHist(blockHistogramSize); std::vector<float> blockHist(blockHistogramSize);
#if CV_SSE2 || CV_NEON #if CV_SIMD128
float partSum[4]; float partSum[4];
#endif #endif
@ -3557,37 +3328,21 @@ void HOGDescriptor::detectROI(InputArray _img, const std::vector<cv::Point> &loc
// need to divide this into 4 parts! // need to divide this into 4 parts!
const float* vec = cache.getBlock(pt, &blockHist[0]); const float* vec = cache.getBlock(pt, &blockHist[0]);
#if CV_SSE2 #if CV_SIMD128
__m128 _vec = _mm_loadu_ps(vec); v_float32x4 _vec = v_load(vec);
__m128 _svmVec = _mm_loadu_ps(svmVec); v_float32x4 _svmVec = v_load(svmVec);
__m128 sum = _mm_mul_ps(_svmVec, _vec); v_float32x4 sum = _svmVec * _vec;
for( k = 4; k <= blockHistogramSize - 4; k += 4 ) for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{ {
_vec = _mm_loadu_ps(vec + k); _vec = v_load(vec + k);
_svmVec = _mm_loadu_ps(svmVec + k); _svmVec = v_load(svmVec + k);
sum = _mm_add_ps(sum, _mm_mul_ps(_vec, _svmVec)); sum += _vec * _svmVec;
} }
_mm_storeu_ps(partSum, sum); v_store(partSum, sum);
double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3];
s += t0 + t1;
#elif CV_NEON
float32x4_t _vec = vld1q_f32(vec);
float32x4_t _svmVec = vld1q_f32(svmVec);
float32x4_t sum = vmulq_f32(_svmVec, _vec);
for( k = 4; k <= blockHistogramSize - 4; k += 4 )
{
_vec = vld1q_f32(vec + k);
_svmVec = vld1q_f32(svmVec + k);
sum = vaddq_f32(sum, vmulq_f32(_vec, _svmVec));
}
vst1q_f32(partSum, sum);
double t0 = partSum[0] + partSum[1]; double t0 = partSum[0] + partSum[1];
double t1 = partSum[2] + partSum[3]; double t1 = partSum[2] + partSum[3];
s += t0 + t1; s += t0 + t1;

@ -1,7 +1,7 @@
set(the_description "Images stitching") set(the_description "Images stitching")
if(HAVE_CUDA) if(HAVE_CUDA)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wstrict-aliasing)
endif() endif()
set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d") set(STITCHING_CONTRIB_DEPS "opencv_xfeatures2d")

@ -499,7 +499,7 @@ struct CvCapture_FFMPEG
double r2d(AVRational r) const; double r2d(AVRational r) const;
int64_t dts_to_frame_number(int64_t dts); int64_t dts_to_frame_number(int64_t dts);
double dts_to_sec(int64_t dts); double dts_to_sec(int64_t dts) const;
AVFormatContext * ic; AVFormatContext * ic;
AVCodec * avcodec; AVCodec * avcodec;
@ -892,7 +892,14 @@ bool CvCapture_FFMPEG::open( const char* _filename )
#else #else
av_dict_set(&dict, "rtsp_transport", "tcp", 0); av_dict_set(&dict, "rtsp_transport", "tcp", 0);
#endif #endif
int err = avformat_open_input(&ic, _filename, NULL, &dict); AVInputFormat* input_format = NULL;
AVDictionaryEntry* entry = av_dict_get(dict, "input_format", NULL, 0);
if (entry != 0)
{
input_format = av_find_input_format(entry->value);
}
int err = avformat_open_input(&ic, _filename, input_format, &dict);
#else #else
int err = av_open_input_file(&ic, _filename, NULL, 0, NULL); int err = av_open_input_file(&ic, _filename, NULL, 0, NULL);
#endif #endif
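Aside (not part of the patch): the new lookup lets the option dictionary select a specific libavformat demuxer via an "input_format" entry. A usage sketch, assuming this build fills that dictionary from the OPENCV_FFMPEG_CAPTURE_OPTIONS environment variable ("key;value" pairs separated by '|'); the stream URL is hypothetical:

#include <cstdlib>
#include <opencv2/videoio.hpp>

int main()
{
    // Ask the FFmpeg backend to use the mjpeg demuxer for this capture (POSIX setenv).
    setenv("OPENCV_FFMPEG_CAPTURE_OPTIONS", "input_format;mjpeg", 1);
    cv::VideoCapture cap("http://example.local/stream.mjpg", cv::CAP_FFMPEG);
    return cap.isOpened() ? 0 : 1;
}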
@ -1168,7 +1175,11 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
switch( property_id ) switch( property_id )
{ {
case CAP_PROP_POS_MSEC: case CAP_PROP_POS_MSEC:
return 1000.0*(double)frame_number/get_fps(); if (picture_pts == AV_NOPTS_VALUE_)
{
return 0;
}
return (dts_to_sec(picture_pts) * 1000);
case CAP_PROP_POS_FRAMES: case CAP_PROP_POS_FRAMES:
return (double)frame_number; return (double)frame_number;
case CAP_PROP_POS_AVI_RATIO: case CAP_PROP_POS_AVI_RATIO:
@ -1278,7 +1289,7 @@ int64_t CvCapture_FFMPEG::dts_to_frame_number(int64_t dts)
return (int64_t)(get_fps() * sec + 0.5); return (int64_t)(get_fps() * sec + 0.5);
} }
double CvCapture_FFMPEG::dts_to_sec(int64_t dts) double CvCapture_FFMPEG::dts_to_sec(int64_t dts) const
{ {
return (double)(dts - ic->streams[video_stream]->start_time) * return (double)(dts - ic->streams[video_stream]->start_time) *
r2d(ic->streams[video_stream]->time_base); r2d(ic->streams[video_stream]->time_base);
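Aside (not part of the patch): with the change above, CAP_PROP_POS_MSEC is derived from the decoded frame's timestamp via dts_to_sec() instead of frame_number/fps, so it tracks the stream's own clock. A small usage sketch (file name is hypothetical):

#include <cstdio>
#include <opencv2/videoio.hpp>

int main()
{
    cv::VideoCapture cap("video.avi", cv::CAP_FFMPEG);
    cv::Mat frame;
    while (cap.read(frame))
        std::printf("pos = %.1f ms\n", cap.get(cv::CAP_PROP_POS_MSEC));
    return 0;
}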

@ -796,11 +796,10 @@ bool CvCaptureCAM_V4L::open(int _index)
name = cv::format("/dev/video%d", _index); name = cv::format("/dev/video%d", _index);
} }
/* Print the CameraNumber at the end of the string with a width of one character */
bool res = open(name.c_str()); bool res = open(name.c_str());
if (!res) if (!res)
{ {
fprintf(stderr, "VIDEOIO ERROR: V4L: can't open camera by index %d\n", _index); CV_LOG_WARNING(NULL, cv::format("VIDEOIO ERROR: V4L: can't open camera by index %d", _index));
} }
return res; return res;
} }
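Aside (not part of the patch): open-by-index now reports failure through the OpenCV logging facility instead of fprintf. A small usage sketch that opens index 0 explicitly via the V4L backend:

#include <opencv2/videoio.hpp>
#include <opencv2/core/utils/logger.hpp>

int main()
{
    cv::VideoCapture cap(0, cv::CAP_V4L2);
    if (!cap.isOpened())
        CV_LOG_WARNING(NULL, "camera index 0 could not be opened via V4L2");
    return cap.isOpened() ? 0 : 1;
}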

@ -84,7 +84,7 @@ public:
{ {
if (!videoio_registry::hasBackend(apiPref)) if (!videoio_registry::hasBackend(apiPref))
throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref)); throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265")) if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
throw SkipTestException("Unstable MSMF test"); throw SkipTestException("Unstable MSMF test");
writeVideo(); writeVideo();
VideoCapture cap; VideoCapture cap;
@ -172,7 +172,7 @@ public:
{ {
if (!videoio_registry::hasBackend(apiPref)) if (!videoio_registry::hasBackend(apiPref))
throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref)); throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref));
if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265")) if (cvtest::skipUnstableTests && apiPref == CAP_MSMF && (ext == "h264" || ext == "h265" || ext == "mpg"))
throw SkipTestException("Unstable MSMF test"); throw SkipTestException("Unstable MSMF test");
VideoCapture cap; VideoCapture cap;
EXPECT_NO_THROW(cap.open(video_file, apiPref)); EXPECT_NO_THROW(cap.open(video_file, apiPref));
