Merge remote-tracking branch 'upstream/3.4' into merge-3.4

pull/13743/head
Alexander Alekhin 6 years ago
commit 665408e57f
Changed files (changed line counts in parentheses):

   1. modules/dnn/src/dnn.cpp (18)
   2. modules/dnn/src/layers/blank_layer.cpp (21)
   3. modules/dnn/src/layers/convolution_layer.cpp (36)
   4. modules/dnn/src/layers/pooling_layer.cpp (8)
   5. modules/dnn/src/op_inf_engine.cpp (36)
   6. modules/dnn/src/op_inf_engine.hpp (4)
   7. modules/dnn/test/test_layers.cpp (90)
   8. modules/dnn/test/test_misc.cpp (34)
   9. modules/dnn/test/test_onnx_importer.cpp (2)
  10. modules/dnn/test/test_tf_importer.cpp (4)
  11. modules/imgproc/perf/perf_blur.cpp (23)
  12. modules/imgproc/src/blend.cpp (223)
  13. modules/imgproc/src/median_blur.cpp (92)
  14. modules/imgproc/src/pyramids.cpp (3)
  15. modules/imgproc/src/spatialgradient.cpp (258)
  16. modules/imgproc/test/test_filter.cpp (9)
  17. modules/js/src/core_bindings.cpp (9)
  18. modules/ml/src/svm.cpp (11)
  19. modules/ml/test/test_svmtrainauto.cpp (45)
  20. samples/dnn/tf_text_graph_common.py (4)

modules/dnn/src/dnn.cpp

@@ -148,7 +148,13 @@ private:
 #else
         cv::dnn::Net net;
         cv::dnn::LayerParams lp;
-        net.addLayerToPrev("testLayer", "Identity", lp);
+        lp.set("kernel_size", 1);
+        lp.set("num_output", 1);
+        lp.set("bias_term", false);
+        lp.type = "Convolution";
+        lp.name = "testLayer";
+        lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
+        net.addLayerToPrev(lp.name, lp.type, lp);
         net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
         net.setPreferableTarget(target);
         static int inpDims[] = {1, 2, 3, 4};

@@ -2676,7 +2682,7 @@ Net Net::readFromModelOptimizer(const String& xml, const String& bin)
     backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
     for (auto& it : ieNet.getOutputsInfo())
     {
-        Ptr<Layer> cvLayer(new InfEngineBackendLayer(it.second));
+        Ptr<Layer> cvLayer(new InfEngineBackendLayer(ieNet));
         InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
         CV_Assert(ieLayer);

@@ -2871,8 +2877,7 @@ void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
     std::vector<LayerPin> pins;
     for (int i = 0; i < outBlobNames.size(); i++)
     {
-        std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
-        pins.insert(pins.end(), lp.begin(), lp.end());
+        pins.push_back(impl->getPinByAlias(outBlobNames[i]));
     }
     impl->setUpNet(pins);

@@ -2885,9 +2890,10 @@ void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
     for (int i = 0; i < outBlobNames.size(); i++)
     {
         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
-        for (int i = 0; i < lp.size(); i++)
+        outputBlobs[i].resize(lp.size());
+        for (int j = 0; j < lp.size(); j++)
         {
-            outputBlobs[i].push_back(impl->getBlob(lp[i]));
+            outputBlobs[i][j] = impl->getBlob(lp[j]);
         }
     }
 }
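Note on the two Net::forward hunks above: the loop that copies blobs out previously shadowed the outer index, so the outputs of a multi-output layer could land in the wrong entries; after the change each outputBlobs[i] is resized to the layer's pin count and filled per pin (the new forwardAndRetrieve test in test_misc.cpp below exercises exactly this). A minimal usage sketch, assuming a Net whose inputs have already been set with setInput(); the helper name allOutputsOf is illustrative and not part of the patch:

    #include <opencv2/dnn.hpp>

    // Collect every output blob of a (possibly multi-output) layer by name.
    static std::vector<cv::Mat> allOutputsOf(cv::dnn::Net& net, const cv::String& layerName)
    {
        std::vector<std::vector<cv::Mat> > outs;
        net.forward(outs, std::vector<cv::String>(1, layerName));
        return outs[0];  // one Mat per output pin of layerName
    }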

modules/dnn/src/layers/blank_layer.cpp

@@ -110,14 +110,25 @@ public:
     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
     {
 #ifdef HAVE_INF_ENGINE
+        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
+        CV_Assert(!input->dims.empty());
 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
-        InferenceEngine::Builder::SplitLayer ieLayer(name);
-        ieLayer.setOutputPorts({InferenceEngine::Port()});
+        InferenceEngine::Builder::Layer ieLayer(name);
+        ieLayer.setName(name);
+        if (preferableTarget == DNN_TARGET_MYRIAD)
+        {
+            ieLayer.setType("Copy");
+        }
+        else
+        {
+            ieLayer.setType("Split");
+            ieLayer.getParameters()["axis"] = input->dims.size() - 1;
+            ieLayer.getParameters()["out_sizes"] = input->dims[0];
+        }
+        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(1));
+        ieLayer.setOutputPorts(std::vector<InferenceEngine::Port>(1));
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 #else
-        InferenceEngine::DataPtr input = infEngineDataNode(inputs[0]);
-        CV_Assert(!input->dims.empty());
         InferenceEngine::LayerParams lp;
         lp.name = name;
         lp.type = "Split";

modules/dnn/src/layers/convolution_layer.cpp

@@ -281,7 +281,7 @@ public:
         const int outCn = blobs[0].size[0];
         // prepare weightsMat where each row is aligned and has enough zero padding on the right to
         // use vectorized (i.e. with intrinsics) loops without tail processing
-        Mat wm = blobs[0].reshape(1, outCn).clone();
+        Mat wm = blobs[0].reshape(1, outCn);
         if( wm.step1() % VEC_ALIGN != 0 )
         {
             int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);

@@ -374,6 +374,10 @@ public:
         if (!w.empty())
         {
+            // Keep origin weights unchanged.
+            if (weightsMat.data == blobs[0].data)
+                weightsMat = weightsMat.clone();
             Mat originWeights = blobs[0].reshape(1, outCn);
             for (int i = 0; i < outCn; ++i)
             {

@@ -551,13 +555,13 @@ public:
 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R5)
         InferenceEngine::Builder::ConvolutionLayer ieLayer(name);
-        ieLayer.setKernel({kernel.height, kernel.width});
-        ieLayer.setStrides({stride.height, stride.width});
-        ieLayer.setDilation({dilation.height, dilation.width});
-        ieLayer.setPaddingsBegin({pad.height, pad.width});
-        ieLayer.setPaddingsEnd({pad.height, pad.width});
-        ieLayer.setGroup(group);
-        ieLayer.setOutDepth(outCn);
+        ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width});
+        ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width});
+        ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width});
+        ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setGroup((size_t)group);
+        ieLayer.setOutDepth((size_t)outCn);
         ieLayer.setWeights(ieWeights);
         if (ieBiases)

@@ -1220,7 +1224,7 @@ public:
 #ifdef HAVE_INF_ENGINE
         if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
         {
-            if (INF_ENGINE_RELEASE == 2018050000 && (adjustPad.height || adjustPad.width))
+            if (INF_ENGINE_RELEASE >= 2018050000 && (adjustPad.height || adjustPad.width))
                 return false;
             const int outGroupCn = blobs[0].size[1];  // Weights are in IOHW layout

@@ -1783,13 +1787,13 @@ public:
         InferenceEngine::Builder::DeconvolutionLayer ieLayer(name);
-        ieLayer.setKernel({kernel.height, kernel.width});
-        ieLayer.setStrides({stride.height, stride.width});
-        ieLayer.setDilation({dilation.height, dilation.width});
-        ieLayer.setPaddingsBegin({pad.height, pad.width});
-        ieLayer.setPaddingsEnd({pad.height, pad.width});
-        ieLayer.setGroup(group);
-        ieLayer.setOutDepth(numOutput);
+        ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width});
+        ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width});
+        ieLayer.setDilation({(size_t)dilation.height, (size_t)dilation.width});
+        ieLayer.setPaddingsBegin({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setPaddingsEnd({(size_t)pad.height, (size_t)pad.width});
+        ieLayer.setGroup((size_t)group);
+        ieLayer.setOutDepth((size_t)numOutput);
         ieLayer.setWeights(wrapToInfEngineBlob(blobs[0], InferenceEngine::Layout::OIHW));
         if (hasBias())

modules/dnn/src/layers/pooling_layer.cpp

@@ -299,10 +299,10 @@ public:
         if (type == MAX || type == AVE)
         {
             InferenceEngine::Builder::PoolingLayer ieLayer(name);
-            ieLayer.setKernel({kernel.height, kernel.width});
-            ieLayer.setStrides({stride.height, stride.width});
-            ieLayer.setPaddingsBegin({pad_t, pad_l});
-            ieLayer.setPaddingsEnd({pad_b, pad_r});
+            ieLayer.setKernel({(size_t)kernel.height, (size_t)kernel.width});
+            ieLayer.setStrides({(size_t)stride.height, (size_t)stride.width});
+            ieLayer.setPaddingsBegin({(size_t)pad_t, (size_t)pad_l});
+            ieLayer.setPaddingsEnd({(size_t)pad_b, (size_t)pad_r});
             ieLayer.setPoolingType(type == MAX ?
                                    InferenceEngine::Builder::PoolingLayer::PoolingType::MAX :
                                    InferenceEngine::Builder::PoolingLayer::PoolingType::AVG);

modules/dnn/src/op_inf_engine.cpp

@@ -82,7 +82,7 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
     CV_Assert(it != layers.end());
     const int layerId = it->second;
-    for (int i = 0; i < inpWrappers.size(); ++i)
+    for (size_t i = 0; i < inpWrappers.size(); ++i)
     {
         const auto& inp = inpWrappers[i];
         const std::string& inpName = inp->dataPtr->name;

@@ -103,7 +103,7 @@ void InfEngineBackendNet::connect(const std::vector<Ptr<BackendWrapper> >& input
         else
             inpId = it->second;
-        netBuilder.connect(inpId, {layerId, i});
+        netBuilder.connect((size_t)inpId, {(size_t)layerId, i});
         unconnectedLayersIds.erase(inpId);
     }
     CV_Assert(!outputs.empty());

@@ -119,7 +119,7 @@ void InfEngineBackendNet::init(int targetId)
         for (int id : unconnectedLayersIds)
         {
             InferenceEngine::Builder::OutputLayer outLayer("myconv1");
-            netBuilder.addLayer({id}, outLayer);
+            netBuilder.addLayer({InferenceEngine::PortInfo(id)}, outLayer);
         }
         cnn = InferenceEngine::CNNNetwork(InferenceEngine::Builder::convertToICNNNetwork(netBuilder.build()));
     }

@@ -718,19 +718,33 @@ Mat infEngineBlobToMat(const InferenceEngine::Blob::Ptr& blob)
     return Mat(size, CV_32F, (void*)blob->buffer());
 }

-InfEngineBackendLayer::InfEngineBackendLayer(const InferenceEngine::DataPtr& output_)
-{
-    output = output_;
-}
-
 bool InfEngineBackendLayer::getMemoryShapes(const std::vector<MatShape> &inputs,
                                             const int requiredOutputs,
                                             std::vector<MatShape> &outputs,
                                             std::vector<MatShape> &internals) const
 {
-    std::vector<size_t> dims = output->dims;
-    std::vector<int> shape(dims.rbegin(), dims.rend());
-    outputs.assign(1, shape);
+    InferenceEngine::ICNNNetwork::InputShapes inShapes = t_net.getInputShapes();
+    InferenceEngine::ICNNNetwork::InputShapes::iterator itr;
+    bool equal_flag = true;
+    size_t i = 0;
+    for (itr = inShapes.begin(); itr != inShapes.end(); ++itr)
+    {
+        InferenceEngine::SizeVector currentInShape(inputs[i].begin(), inputs[i].end());
+        if (itr->second != currentInShape)
+        {
+            itr->second = currentInShape;
+            equal_flag = false;
+        }
+        i++;
+    }
+    if (!equal_flag)
+    {
+        InferenceEngine::CNNNetwork curr_t_net(t_net);
+        curr_t_net.reshape(inShapes);
+    }
+    std::vector<size_t> dims = t_net.getOutputsInfo()[name]->getDims();
+    outputs.push_back(MatShape(dims.begin(), dims.end()));
     return false;
 }

modules/dnn/src/op_inf_engine.hpp

@@ -260,7 +260,7 @@ InferenceEngine::TBlob<int16_t>::Ptr convertFp16(const InferenceEngine::Blob::Pt
 class InfEngineBackendLayer : public Layer
 {
 public:
-    InfEngineBackendLayer(const InferenceEngine::DataPtr& output);
+    InfEngineBackendLayer(const InferenceEngine::CNNNetwork &t_net_) : t_net(t_net_) {};

     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                  const int requiredOutputs,

@@ -273,7 +273,7 @@ public:
     virtual bool supportBackend(int backendId) CV_OVERRIDE;

 private:
-    InferenceEngine::DataPtr output;
+    InferenceEngine::CNNNetwork t_net;
 };

 #endif  // HAVE_INF_ENGINE

modules/dnn/test/test_layers.cpp

@@ -236,6 +236,10 @@ TEST_P(Test_Caffe_layers, Dropout)
 TEST_P(Test_Caffe_layers, Concat)
 {
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE > 2018050000
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
+        throw SkipTestException("");
+#endif
     testLayerUsingCaffeModels("layer_concat");
     testLayerUsingCaffeModels("layer_concat_optim", true, false);
     testLayerUsingCaffeModels("layer_concat_shared_input", true, false);

@@ -923,8 +927,9 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
 {
     Target targetId = GetParam();
+    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
     Net netDefault = readNet(_tf("layer_convolution.caffemodel"), _tf("layer_convolution.prototxt"));
-    Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
+    Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));
     Mat inp = blobFromNPY(_tf("blob.npy"));

@@ -935,22 +940,15 @@ TEST_P(Layer_Test_Convolution_DLDT, Accuracy)
     net.setInput(inp);
     net.setPreferableTarget(targetId);
-    if (targetId != DNN_TARGET_MYRIAD)
-    {
-        Mat out = net.forward();
-        normAssert(outDefault, out);
-        std::vector<int> outLayers = net.getUnconnectedOutLayers();
-        ASSERT_EQ(net.getLayer(outLayers[0])->name, "output_merge");
-        ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat");
-    }
-    else
-    {
-        // An assertion is expected because the model is in FP32 format but
-        // Myriad plugin supports only FP16 models.
-        ASSERT_ANY_THROW(net.forward());
-    }
+    Mat out = net.forward();
+
+    double l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.4e-3 : 1e-5;
+    double lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? 1.8e-2 : 1e-4;
+    normAssert(outDefault, out, "", l1, lInf);
+
+    std::vector<int> outLayers = net.getUnconnectedOutLayers();
+    ASSERT_EQ(net.getLayer(outLayers[0])->name, "output");
+    ASSERT_EQ(net.getLayer(outLayers[0])->type, "Convolution");
 }

 TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)

@@ -962,23 +960,16 @@ TEST_P(Layer_Test_Convolution_DLDT, setInput_uint8)
     randu(inputs[0], 0, 255);
     inputs[0].convertTo(inputs[1], CV_32F);

+    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
     Mat outs[2];
     for (int i = 0; i < 2; ++i)
     {
-        Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin"));
+        Net net = readNet(_tf("layer_convolution" + suffix + ".xml"), _tf("layer_convolution" + suffix + ".bin"));
         net.setPreferableTarget(targetId);
         net.setInput(inputs[i]);
-        if (targetId != DNN_TARGET_MYRIAD)
-        {
-            outs[i] = net.forward();
-            ASSERT_EQ(outs[i].type(), CV_32F);
-        }
-        else
-        {
-            // An assertion is expected because the model is in FP32 format but
-            // Myriad plugin supports only FP16 models.
-            ASSERT_ANY_THROW(net.forward());
-        }
+        outs[i] = net.forward();
+        ASSERT_EQ(outs[i].type(), CV_32F);
     }
     if (targetId != DNN_TARGET_MYRIAD)
         normAssert(outs[0], outs[1]);

@@ -1008,8 +999,8 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_Test_Convolution_DLDT,
 // net.save('/path/to/caffemodel')
 //
 // 3. Convert using ModelOptimizer.
-typedef testing::TestWithParam<tuple<int, int, Target> > Test_DLDT_two_inputs;
-TEST_P(Test_DLDT_two_inputs, as_IR)
+typedef testing::TestWithParam<tuple<int, int, Target, std::vector<int> > > Test_DLDT_two_inputs_3dim;
+TEST_P(Test_DLDT_two_inputs_3dim, as_IR)
 {
     int firstInpType = get<0>(GetParam());
     int secondInpType = get<1>(GetParam());

@@ -1020,32 +1011,39 @@ TEST_P(Test_DLDT_two_inputs, as_IR)
     throw SkipTestException("Test is enabled starts from OpenVINO 2018R4");
 #endif

-    Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin"));
-    int inpSize[] = {1, 2, 3};
-    Mat firstInp(3, &inpSize[0], firstInpType);
-    Mat secondInp(3, &inpSize[0], secondInpType);
+    std::string suffix = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) ? "_fp16" : "";
+    Net net = readNet(_tf("net_two_inputs" + suffix + ".xml"), _tf("net_two_inputs.bin"));
+    std::vector<int> inpSize = get<3>(GetParam());
+    Mat firstInp(3, inpSize.data(), firstInpType);
+    Mat secondInp(3, inpSize.data(), secondInpType);
     randu(firstInp, 0, 255);
     randu(secondInp, 0, 255);

     net.setInput(firstInp, "data");
     net.setInput(secondInp, "second_input");
     net.setPreferableTarget(targetId);
-    if (targetId != DNN_TARGET_MYRIAD)
-    {
-        Mat out = net.forward();
-        Mat ref;
-        cv::add(firstInp, secondInp, ref, Mat(), CV_32F);
-        normAssert(out, ref);
-    }
-    else
-    {
-        // An assertion is expected because the model is in FP32 format but
-        // Myriad plugin supports only FP16 models.
-        ASSERT_ANY_THROW(net.forward());
-    }
+
+    double l1 = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) &&
+                 (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.06 : 0.0;
+    double lInf = ((targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) &&
+                   (firstInpType == CV_32F || secondInpType == CV_32F)) ? 0.23 : 0.0;
+
+    Mat out = net.forward();
+
+    Mat ref;
+    cv::add(firstInp, secondInp, ref, Mat(), CV_32F);
+    normAssert(out, ref, "", l1, lInf);
 }

+std::vector< std::vector<int> > list_sizes{ {1, 2, 3}, {3, 2, 1}, {5, 5, 5}, {13, 7, 11} };
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_DLDT_two_inputs_3dim, Combine(
+  Values(CV_8U, CV_32F), Values(CV_8U, CV_32F),
+  testing::ValuesIn(getAvailableTargets(DNN_BACKEND_INFERENCE_ENGINE)),
+  testing::ValuesIn(list_sizes)
+));
+
+typedef testing::TestWithParam<tuple<int, int, Target> > Test_DLDT_two_inputs;
 TEST_P(Test_DLDT_two_inputs, as_backend)
 {
     static const float kScale = 0.5f;

modules/dnn/test/test_misc.cpp

@@ -308,4 +308,38 @@ TEST_P(DeprecatedForward, CustomLayerWithFallback)
 INSTANTIATE_TEST_CASE_P(/**/, DeprecatedForward, dnnBackendsAndTargets());

+TEST(Net, forwardAndRetrieve)
+{
+    std::string prototxt =
+        "input: \"data\"\n"
+        "layer {\n"
+        "  name: \"testLayer\"\n"
+        "  type: \"Slice\"\n"
+        "  bottom: \"data\"\n"
+        "  top: \"firstCopy\"\n"
+        "  top: \"secondCopy\"\n"
+        "  slice_param {\n"
+        "    axis: 0\n"
+        "    slice_point: 2\n"
+        "  }\n"
+        "}";
+    Net net = readNetFromCaffe(&prototxt[0], prototxt.size());
+    net.setPreferableBackend(DNN_BACKEND_OPENCV);
+
+    Mat inp(4, 5, CV_32F);
+    randu(inp, -1, 1);
+    net.setInput(inp);
+
+    std::vector<String> outNames;
+    outNames.push_back("testLayer");
+    std::vector<std::vector<Mat> > outBlobs;
+
+    net.forward(outBlobs, outNames);
+
+    EXPECT_EQ(outBlobs.size(), 1);
+    EXPECT_EQ(outBlobs[0].size(), 2);
+    normAssert(outBlobs[0][0], inp.rowRange(0, 2), "first part");
+    normAssert(outBlobs[0][1], inp.rowRange(2, 4), "second part");
+}
+
 }} // namespace

modules/dnn/test/test_onnx_importer.cpp

@@ -395,7 +395,7 @@ TEST_P(Test_ONNX_nets, DenseNet121)
 TEST_P(Test_ONNX_nets, Inception_v1)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         throw SkipTestException("Test is disabled for OpenVINO 2018R5");
 #endif

modules/dnn/test/test_tf_importer.cpp

@@ -241,7 +241,7 @@ TEST_P(Test_TensorFlow_layers, unfused_flatten)
 TEST_P(Test_TensorFlow_layers, leaky_relu)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL)
         throw SkipTestException("");
 #endif

@@ -388,7 +388,7 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN)
 TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE == 2018050000
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_RELEASE >= 2018050000
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
         throw SkipTestException("Unstable test case");
 #endif

modules/imgproc/perf/perf_blur.cpp

@@ -230,4 +230,27 @@ PERF_TEST_P(Size_MatType_BorderType, blur5x5,
     SANITY_CHECK(dst, 1);
 }

+///////////// BlendLinear ////////////////////////
+PERF_TEST_P(Size_MatType, BlendLinear,
+            testing::Combine(
+                testing::Values(szVGA, sz720p, sz1080p, sz2160p),
+                testing::Values(CV_8UC1, CV_32FC1, CV_8UC3, CV_32FC3, CV_8UC4, CV_32FC4)
+            )
+)
+{
+    const Size srcSize = get<0>(GetParam());
+    const int srcType = get<1>(GetParam());
+
+    Mat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType);
+    Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1);
+
+    declare.in(src1, src2, WARMUP_RNG).in(weights1, weights2, WARMUP_READ).out(dst);
+    randu(weights1, 0, 1);
+    randu(weights2, 0, 1);
+
+    TEST_CYCLE() blendLinear(src1, src2, weights1, weights2, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace

modules/imgproc/src/blend.cpp

@@ -48,44 +48,44 @@
 #include "opencv2/core/hal/intrin.hpp"
 namespace cv {
-#if CV_SIMD128
-static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2)
+#if CV_SIMD
+static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const v_float32& v_w1, const v_float32& v_w2)
 {
-    const v_float32x4 v_eps = v_setall_f32(1e-5f);
-    v_float32x4 v_denom = v_w1 + v_w2 + v_eps;
+    const v_float32 v_eps = vx_setall_f32(1e-5f);
+    v_float32 v_denom = v_w1 + v_w2 + v_eps;
     return (v_src1 * v_w1 + v_src2 * v_w2) / v_denom;
 }
-static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
+static inline v_float32 blend(const v_float32& v_src1, const v_float32& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
 {
-    v_float32x4 v_w1 = v_load(w_ptr1 + offset);
-    v_float32x4 v_w2 = v_load(w_ptr2 + offset);
+    v_float32 v_w1 = vx_load(w_ptr1 + offset);
+    v_float32 v_w2 = vx_load(w_ptr2 + offset);
     return blend(v_src1, v_src2, v_w1, v_w2);
 }
-static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec)
+static inline v_uint32 saturate_f32_u32(const v_float32& vec)
 {
-    const v_int32x4 z = v_setzero_s32();
-    const v_int32x4 x = v_setall_s32(255);
+    const v_int32 z = vx_setzero_s32();
+    const v_int32 x = vx_setall_s32(255);
     return v_reinterpret_as_u32(v_min(v_max(v_round(vec), z), x));
 }
-static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+static inline v_uint8 pack_f32tou8(v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3)
 {
-    v_uint32x4 a = saturate_f32_u32(val0);
-    v_uint32x4 b = saturate_f32_u32(val1);
-    v_uint32x4 c = saturate_f32_u32(val2);
-    v_uint32x4 d = saturate_f32_u32(val3);
-    v_uint16x8 e = v_pack(a, b);
-    v_uint16x8 f = v_pack(c, d);
+    v_uint32 a = saturate_f32_u32(val0);
+    v_uint32 b = saturate_f32_u32(val1);
+    v_uint32 c = saturate_f32_u32(val2);
+    v_uint32 d = saturate_f32_u32(val3);
+    v_uint16 e = v_pack(a, b);
+    v_uint16 f = v_pack(c, d);
     return v_pack(e, f);
 }
-static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
+static inline void store_pack_f32tou8(uchar* ptr, v_float32& val0, v_float32& val1, v_float32& val2, v_float32& val3)
 {
     v_store((ptr), pack_f32tou8(val0, val1, val2, val3));
 }
-static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+static inline void expand_u8tof32(const v_uint8& src, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3)
 {
-    v_uint16x8 a0, a1;
+    v_uint16 a0, a1;
     v_expand(src, a0, a1);
-    v_uint32x4 b0, b1,b2,b3;
+    v_uint32 b0, b1,b2,b3;
     v_expand(a0, b0, b1);
     v_expand(a1, b2, b3);
     dst0 = v_cvt_f32(v_reinterpret_as_s32(b0));

@@ -93,71 +93,69 @@ static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_fl
     dst2 = v_cvt_f32(v_reinterpret_as_s32(b2));
     dst3 = v_cvt_f32(v_reinterpret_as_s32(b3));
 }
-static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
+static inline void load_expand_u8tof32(const uchar* ptr, v_float32& dst0, v_float32& dst1, v_float32& dst2, v_float32& dst3)
 {
-    v_uint8x16 a = v_load((ptr));
+    v_uint8 a = vx_load((ptr));
     expand_u8tof32(a, dst0, dst1, dst2, dst3);
 }
-int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
-int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
-int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
+int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
+int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
+int blendLinearSimd(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
 {
-    int step = v_uint8x16::nlanes * cn;
-    int weight_step = v_uint8x16::nlanes;
     switch(cn)
     {
     case 1:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_uint8::nlanes)
         {
-            v_float32x4 v_src10, v_src11, v_src12, v_src13;
-            v_float32x4 v_src20, v_src21, v_src22, v_src23;
+            v_float32 v_src10, v_src11, v_src12, v_src13;
+            v_float32 v_src20, v_src21, v_src22, v_src23;
             load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
             load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
-            v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4);
-            v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8);
-            v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12);
+            v_float32 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
+            v_float32 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + v_float32::nlanes);
+            v_float32 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 2*v_float32::nlanes);
+            v_float32 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 3*v_float32::nlanes);
             store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
         }
         break;
     case 2:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
         {
-            v_uint8x16 v_src10, v_src11, v_src20, v_src21;
+            v_uint8 v_src10, v_src11, v_src20, v_src21;
             v_load_deinterleave(src1 + x, v_src10, v_src11);
             v_load_deinterleave(src2 + x, v_src20, v_src21);
-            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
-            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
+            v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
+            v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
             expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
             expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
             expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
             expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
-            v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
-            v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
-            v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4);
-            v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4);
-            v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8);
-            v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8);
-            v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12);
-            v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12);
-            v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
-            v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
+            v_float32 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
+            v_float32 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
+            v_float32 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + v_float32::nlanes);
+            v_float32 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + v_float32::nlanes);
+            v_float32 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 2*v_float32::nlanes);
+            v_float32 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 2*v_float32::nlanes);
+            v_float32 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 3*v_float32::nlanes);
+            v_float32 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 3*v_float32::nlanes);
+            v_uint8 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
+            v_uint8 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
             v_store_interleave(dst + x, v_dsta, v_dstb);
         }
         break;
     case 3:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - 3*v_uint8::nlanes; x += 3*v_uint8::nlanes, weight_offset += v_uint8::nlanes)
        {
-            v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_uint8 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
             v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
-            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
-            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
+            v_float32 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
+            v_float32 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
             expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
             expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
             expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123);

@@ -165,14 +163,14 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
             expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
             expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);
-            v_float32x4 v_w10 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4);
-            v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8);
-            v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12);
-            v_float32x4 v_w20 = v_load(weights2 + weight_offset);
-            v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4);
-            v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8);
-            v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12);
+            v_float32 v_w10 = vx_load(weights1 + weight_offset);
+            v_float32 v_w11 = vx_load(weights1 + weight_offset + v_float32::nlanes);
+            v_float32 v_w12 = vx_load(weights1 + weight_offset + 2*v_float32::nlanes);
+            v_float32 v_w13 = vx_load(weights1 + weight_offset + 3*v_float32::nlanes);
+            v_float32 v_w20 = vx_load(weights2 + weight_offset);
+            v_float32 v_w21 = vx_load(weights2 + weight_offset + v_float32::nlanes);
+            v_float32 v_w22 = vx_load(weights2 + weight_offset + 2*v_float32::nlanes);
+            v_float32 v_w23 = vx_load(weights2 + weight_offset + 3*v_float32::nlanes);
             v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
             v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
             v_src120 = blend(v_src120, v_src220, v_w10, v_w20);

@@ -187,34 +185,36 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
             v_src123 = blend(v_src123, v_src223, v_w13, v_w23);
-            v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
-            v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
-            v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
+            v_uint8 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
+            v_uint8 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
+            v_uint8 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
             v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
         }
         break;
     case 4:
-        step = v_uint8x16::nlanes;
-        weight_step = v_float32x4::nlanes;
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
+        for(int weight_offset = 0 ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes, weight_offset += v_float32::nlanes)
         {
-            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
-            v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
+            v_float32 v_src10, v_src11, v_src12, v_src13;
+            v_float32 v_src20, v_src21, v_src22, v_src23;
             load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
             load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);
-            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
-            v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
-            v_src10 = blend(v_src14, v_src24, v_w1, v_w2);
-            v_src11 = blend(v_src15, v_src25, v_w1, v_w2);
-            v_src12 = blend(v_src16, v_src26, v_w1, v_w2);
-            v_src13 = blend(v_src17, v_src27, v_w1, v_w2);
-            v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
-            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
+            v_float32 v_w10, v_w11, v_w12, v_w13, v_w20, v_w21, v_w22, v_w23, v_w0, v_w1;
+            v_w10 = vx_load(weights1 + weight_offset);
+            v_zip(v_w10, v_w10, v_w0, v_w1);
+            v_zip(v_w0, v_w0, v_w10, v_w11);
+            v_zip(v_w1, v_w1, v_w12, v_w13);
+            v_w20 = vx_load(weights2 + weight_offset);
+            v_zip(v_w20, v_w20, v_w0, v_w1);
+            v_zip(v_w0, v_w0, v_w20, v_w21);
+            v_zip(v_w1, v_w1, v_w22, v_w23);
+            v_float32 v_dst0, v_dst1, v_dst2, v_dst3;
+            v_dst0 = blend(v_src10, v_src20, v_w10, v_w20);
+            v_dst1 = blend(v_src11, v_src21, v_w11, v_w21);
+            v_dst2 = blend(v_src12, v_src22, v_w12, v_w22);
+            v_dst3 = blend(v_src13, v_src23, v_w13, v_w23);
             store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
         }
         break;

@@ -224,68 +224,67 @@ int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weight
     return x;
 }

-int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
+int blendLinearSimd(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
 {
-    int step = v_float32x4::nlanes*cn;
     switch(cn)
     {
     case 1:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - v_float32::nlanes; x += v_float32::nlanes, weight_offset += v_float32::nlanes)
         {
-            v_float32x4 v_src1 = v_load(src1 + x);
-            v_float32x4 v_src2 = v_load(src2 + x);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
-            v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2);
+            v_float32 v_src1 = vx_load(src1 + x);
+            v_float32 v_src2 = vx_load(src2 + x);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);
+            v_float32 v_dst = blend(v_src1, v_src2, v_w1, v_w2);
             v_store(dst + x, v_dst);
         }
         break;
     case 2:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - 2*v_float32::nlanes; x += 2*v_float32::nlanes, weight_offset += v_float32::nlanes)
         {
-            v_float32x4 v_src10, v_src11, v_src20, v_src21;
+            v_float32 v_src10, v_src11, v_src20, v_src21;
             v_load_deinterleave(src1 + x, v_src10, v_src11);
             v_load_deinterleave(src2 + x, v_src20, v_src21);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
-            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);
+            v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
             v_store_interleave(dst + x, v_dst0, v_dst1);
         }
         break;
     case 3:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - 3*v_float32::nlanes; x += 3*v_float32::nlanes, weight_offset += v_float32::nlanes)
        {
-            v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
+            v_float32 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
             v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
-            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
-            v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);
+            v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
             v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
         }
         break;
     case 4:
-        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
+        for(int weight_offset = 0 ; x <= width - 4*v_float32::nlanes; x += 4*v_float32::nlanes, weight_offset += v_float32::nlanes)
        {
-            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
+            v_float32 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
             v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
             v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
-            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
-            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
-            v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
-            v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
-            v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
-            v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);
+            v_float32 v_w1 = vx_load(weights1 + weight_offset);
+            v_float32 v_w2 = vx_load(weights2 + weight_offset);
+            v_float32 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
+            v_float32 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
+            v_float32 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
+            v_float32 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);
             v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
         }

@@ -321,8 +320,8 @@ public:
             T * const dst_row = dst->ptr<T>(y);
             int x = 0;
-#if CV_SIMD128
-            x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
+#if CV_SIMD
+            x = blendLinearSimd(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
 #endif
             for ( ; x < width; ++x)
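The blend.cpp hunks above port the SIMD path from fixed 128-bit types (v_float32x4, v_load, hard-coded lane offsets of 4/8/12) to OpenCV's width-agnostic universal intrinsics (v_float32, vx_load, v_float32::nlanes), so the same source covers SSE, AVX2 and NEON register widths. A minimal sketch of that pattern using the same header; scale_add is a hypothetical helper written for illustration, not code from the patch:

    #include "opencv2/core/hal/intrin.hpp"

    // dst[i] = a[i] * w + b[i]; the vector body adapts to whatever SIMD width
    // the build enables, and the scalar loop handles the remaining tail.
    static void scale_add(const float* a, const float* b, float* dst, int n, float w)
    {
        int x = 0;
    #if CV_SIMD
        const cv::v_float32 v_w = cv::vx_setall_f32(w);
        for (; x <= n - cv::v_float32::nlanes; x += cv::v_float32::nlanes)
            cv::v_store(dst + x, cv::vx_load(a + x) * v_w + cv::vx_load(b + x));
    #endif
        for (; x < n; ++x)
            dst[x] = a[x] * w + b[x];
    }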

modules/imgproc/src/median_blur.cpp

@@ -110,15 +110,19 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
     int cn = _dst.channels(), m = _dst.rows, r = (ksize-1)/2;
     CV_Assert(cn > 0 && cn <= 4);
     size_t sstep = _src.step, dstep = _dst.step;
-    Histogram CV_DECL_ALIGNED(16) H[4];
-    HT CV_DECL_ALIGNED(16) luc[4][16];
     int STRIPE_SIZE = std::min( _dst.cols, 512/cn );

-    std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
-    std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + 16);
-    HT* h_coarse = alignPtr(&_h_coarse[0], 16);
-    HT* h_fine = alignPtr(&_h_fine[0], 16);
+#if defined(CV_SIMD_WIDTH) && CV_SIMD_WIDTH >= 16
+# define CV_ALIGNMENT CV_SIMD_WIDTH
+#else
+# define CV_ALIGNMENT 16
+#endif
+    std::vector<HT> _h_coarse(1 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT);
+    std::vector<HT> _h_fine(16 * 16 * (STRIPE_SIZE + 2*r) * cn + CV_ALIGNMENT);
+    HT* h_coarse = alignPtr(&_h_coarse[0], CV_ALIGNMENT);
+    HT* h_fine = alignPtr(&_h_fine[0], CV_ALIGNMENT);

     for( int x = 0; x < _dst.cols; x += STRIPE_SIZE )
     {

@@ -148,10 +152,14 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
         const uchar* p0 = src + sstep * std::max( 0, i-r-1 );
         const uchar* p1 = src + sstep * std::min( m-1, i+r );
-        memset( H, 0, cn*sizeof(H[0]) );
-        memset( luc, 0, cn*sizeof(luc[0]) );
         for( c = 0; c < cn; c++ )
         {
+            Histogram CV_DECL_ALIGNED(CV_ALIGNMENT) H;
+            HT CV_DECL_ALIGNED(CV_ALIGNMENT) luc[16];
+            memset(&H, 0, sizeof(H));
+            memset(luc, 0, sizeof(luc));
             // Update column histograms for the entire row.
             for( j = 0; j < n; j++ )
             {

@@ -163,21 +171,21 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
             for (k = 0; k < 16; ++k)
             {
 #if CV_SIMD256
-                v_store(H[c].fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H[c].fine[k]));
+                v_store(H.fine[k], v_mul_wrap(v256_load(h_fine + 16 * n*(16 * c + k)), v256_setall_u16(2 * r + 1)) + v256_load(H.fine[k]));
 #elif CV_SIMD128
-                v_store(H[c].fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k]));
-                v_store(H[c].fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H[c].fine[k] + 8));
+                v_store(H.fine[k], v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k)), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k]));
+                v_store(H.fine[k] + 8, v_mul_wrap(v_load(h_fine + 16 * n*(16 * c + k) + 8), v_setall_u16((ushort)(2 * r + 1))) + v_load(H.fine[k] + 8));
 #else
                 for (int ind = 0; ind < 16; ++ind)
-                    H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
+                    H.fine[k][ind] = (HT)(H.fine[k][ind] + (2 * r + 1) * h_fine[16 * n*(16 * c + k) + ind]);
 #endif
             }
 #if CV_SIMD256
-            v_uint16x16 v_coarse = v256_load(H[c].coarse);
+            v_uint16x16 v_coarse = v256_load(H.coarse);
 #elif CV_SIMD128
-            v_uint16x8 v_coarsel = v_load(H[c].coarse);
-            v_uint16x8 v_coarseh = v_load(H[c].coarse + 8);
+            v_uint16x8 v_coarsel = v_load(H.coarse);
+            v_uint16x8 v_coarseh = v_load(H.coarse + 8);
 #endif
             HT* px = h_coarse + 16 * n*c;
             for( j = 0; j < 2*r; ++j, px += 16 )

@@ -189,7 +197,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 v_coarseh += v_load(px + 8);
 #else
                 for (int ind = 0; ind < 16; ++ind)
-                    H[c].coarse[ind] += px[ind];
+                    H.coarse[ind] += px[ind];
 #endif
             }

@@ -201,24 +209,24 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 px = h_coarse + 16 * (n*c + std::min(j + r, n - 1));
 #if CV_SIMD256
                 v_coarse += v256_load(px);
-                v_store(H[c].coarse, v_coarse);
+                v_store(H.coarse, v_coarse);
 #elif CV_SIMD128
                 v_coarsel += v_load(px);
                 v_coarseh += v_load(px + 8);
-                v_store(H[c].coarse, v_coarsel);
-                v_store(H[c].coarse + 8, v_coarseh);
+                v_store(H.coarse, v_coarsel);
+                v_store(H.coarse + 8, v_coarseh);
 #else
                 for (int ind = 0; ind < 16; ++ind)
-                    H[c].coarse[ind] += px[ind];
+                    H.coarse[ind] += px[ind];
 #endif
                 // Find median at coarse level
                 for ( k = 0; k < 16 ; ++k )
                 {
-                    sum += H[c].coarse[k];
+                    sum += H.coarse[k];
                     if ( sum > t )
                     {
-                        sum -= H[c].coarse[k];
+                        sum -= H.coarse[k];
                         break;
                     }
                 }

@@ -231,7 +239,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                 v_uint16x8 v_finel;
                 v_uint16x8 v_fineh;
 #endif
-                if ( luc[c][k] <= j-r )
+                if ( luc[k] <= j-r )
                 {
 #if CV_SIMD256
                     v_fine = v256_setzero_u16();

@@ -239,10 +247,10 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                     v_finel = v_setzero_u16();
                     v_fineh = v_setzero_u16();
 #else
-                    memset(&H[c].fine[k], 0, 16 * sizeof(HT));
+                    memset(&H.fine[k], 0, 16 * sizeof(HT));
 #endif
                     px = h_fine + 16 * (n*(16 * c + k) + j - r);
-                    for (luc[c][k] = HT(j - r); luc[c][k] < MIN(j + r + 1, n); ++luc[c][k], px += 16)
+                    for (luc[k] = HT(j - r); luc[k] < MIN(j + r + 1, n); ++luc[k], px += 16)
                     {
 #if CV_SIMD256
                         v_fine += v256_load(px);

@@ -251,11 +259,11 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                         v_fineh += v_load(px + 8);
 #else
                         for (int ind = 0; ind < 16; ++ind)
-                            H[c].fine[k][ind] += px[ind];
+                            H.fine[k][ind] += px[ind];
 #endif
                     }
-                    if ( luc[c][k] < j+r+1 )
+                    if ( luc[k] < j+r+1 )
                     {
                         px = h_fine + 16 * (n*(16 * c + k) + (n - 1));
 #if CV_SIMD256

@@ -265,50 +273,50 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize )
                         v_fineh += v_mul_wrap(v_load(px + 8), v_setall_u16((ushort)(j + r + 1 - n)));
 #else
                         for (int ind = 0; ind < 16; ++ind)
-                            H[c].fine[k][ind] = (HT)(H[c].fine[k][ind] + (j + r + 1 - n) * px[ind]);
+                            H.fine[k][ind] = (HT)(H.fine[k][ind] + (j + r + 1 - n) * px[ind]);
 #endif
-                        luc[c][k] = (HT)(j+r+1);
+                        luc[k] = (HT)(j+r+1);
                     }
                 }
                 else
                 {
 #if CV_SIMD256
-                    v_fine = v256_load(H[c].fine[k]);
+                    v_fine = v256_load(H.fine[k]);
 #elif CV_SIMD128
-                    v_finel = v_load(H[c].fine[k]);
-                    v_fineh = v_load(H[c].fine[k] + 8);
+                    v_finel = v_load(H.fine[k]);
+                    v_fineh = v_load(H.fine[k] + 8);
 #endif
                     px = h_fine + 16*n*(16 * c + k);
-                    for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
+                    for ( ; luc[k] < j+r+1; ++luc[k] )
                     {
 #if CV_SIMD256
-                        v_fine = v_fine + v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                        v_fine = v_fine + v256_load(px + 16 * MIN(luc[k], n - 1)) - v256_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
 #elif CV_SIMD128
-                        v_finel = v_finel + v_load(px + 16 * MIN(luc[c][k], n - 1) ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
-                        v_fineh = v_fineh + v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
+                        v_finel = v_finel + v_load(px + 16 * MIN(luc[k], n - 1) ) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0));
+                        v_fineh = v_fineh + v_load(px + 16 * MIN(luc[k], n - 1) + 8) - v_load(px + 16 * MAX(luc[k] - 2 * r - 1, 0) + 8);
 #else
                         for (int ind = 0; ind < 16; ++ind)
-                            H[c].fine[k][ind] += px[16 * MIN(luc[c][k], n - 1) + ind] - px[16 * MAX(luc[c][k] - 2 * r - 1, 0) + ind];
+                            H.fine[k][ind] += px[16 * MIN(luc[k], n - 1) + ind] - px[16 * MAX(luc[k] - 2 * r - 1, 0) + ind];
 #endif
                     }
                 }
                 px = h_coarse + 16 * (n*c + MAX(j - r, 0));
 #if CV_SIMD256
-                v_store(H[c].fine[k], v_fine);
+                v_store(H.fine[k], v_fine);
                 v_coarse -= v256_load(px);
 #elif CV_SIMD128
-                v_store(H[c].fine[k], v_finel);
-                v_store(H[c].fine[k] + 8, v_fineh);
+                v_store(H.fine[k], v_finel);
+                v_store(H.fine[k] + 8, v_fineh);
                 v_coarsel -= v_load(px);
                 v_coarseh -= v_load(px + 8);
 #else
                 for (int ind = 0; ind < 16; ++ind)
-                    H[c].coarse[ind] -= px[ind];
+                    H.coarse[ind] -= px[ind];
 #endif
                 /* Find median in segment */
-                segment = H[c].fine[k];
+                segment = H.fine[k];
                 for ( b = 0; b < 16 ; b++ )
                 {
                     sum += segment[b];

modules/imgproc/src/pyramids.cpp

@@ -112,6 +112,7 @@ struct PyrDownVec_32s8u
             v_rshr_pack_store<8>(dst + x, t0);
             x += v_uint16::nlanes;
         }
+        typedef int CV_DECL_ALIGNED(1) unaligned_int;
         for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
         {
             v_int32x4 r0, r1, r2, r3, r4, t0;

@@ -122,7 +123,7 @@ struct PyrDownVec_32s8u
             r4 = v_load(row4 + x);
             t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
-            *(int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
+            *((unaligned_int*) (dst + x)) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
         }
         return x;
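The pyramids.cpp hunk stores four packed bytes through a pointer typed as int with CV_DECL_ALIGNED(1), which tells the compiler the destination may not be 4-byte aligned (dst is a uchar row, so the old *(int*)(dst + x) carried no alignment guarantee). A portable way to express the same unaligned 4-byte store, shown here only as a sketch and not as what the patch uses, is memcpy:

    #include <cstring>

    // Store a 32-bit value at an arbitrarily aligned address; compilers lower
    // this to a single unaligned store on targets that support it.
    static inline void store_i32_unaligned(unsigned char* dst, int value)
    {
        std::memcpy(dst, &value, sizeof(value));
    }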

modules/imgproc/src/spatialgradient.cpp

@@ -123,139 +123,125 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
         }
     }

-    // Pointer to row vectors
-    uchar *p_src, *c_src, *n_src; // previous, current, next row
-    short *c_dx, *c_dy;
-
     int i_start = 0;
     int j_start = 0;
-#if CV_SIMD128
-    if(hasSIMD128())
-    {
-        uchar *m_src;
-        short *n_dx, *n_dy;
-
-        // Characters in variable names have the following meanings:
-        // u: unsigned char
-        // s: signed int
-        //
-        // [row][column]
-        // m: offset -1
-        // n: offset 0
-        // p: offset 1
-        // Example: umn is offset -1 in row and offset 0 in column
-        for ( i = 0; i < H - 1; i += 2 )
-        {
-            if ( i == 0 ) p_src = src.ptr<uchar>(i_top);
-            else          p_src = src.ptr<uchar>(i-1);
-
-            c_src = src.ptr<uchar>(i);
-            n_src = src.ptr<uchar>(i+1);
-
-            if ( i == H - 2 ) m_src = src.ptr<uchar>(i_bottom);
-            else              m_src = src.ptr<uchar>(i+2);
-
-            c_dx = dx.ptr<short>(i);
-            c_dy = dy.ptr<short>(i);
-            n_dx = dx.ptr<short>(i+1);
-            n_dy = dy.ptr<short>(i+1);
-
-            // Process rest of columns 16-column chunks at a time
-            for ( j = 1; j < W - 16; j += 16 )
-            {
-                // Load top row for 3x3 Sobel filter
-                v_uint8x16 v_um = v_load(&p_src[j-1]);
-                v_uint8x16 v_un = v_load(&p_src[j]);
-                v_uint8x16 v_up = v_load(&p_src[j+1]);
-                v_uint16x8 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2;
-                v_expand(v_um, v_um1, v_um2);
-                v_expand(v_un, v_un1, v_un2);
-                v_expand(v_up, v_up1, v_up2);
-                v_int16x8 v_s1m1 = v_reinterpret_as_s16(v_um1);
-                v_int16x8 v_s1m2 = v_reinterpret_as_s16(v_um2);
-                v_int16x8 v_s1n1 = v_reinterpret_as_s16(v_un1);
-                v_int16x8 v_s1n2 = v_reinterpret_as_s16(v_un2);
-                v_int16x8 v_s1p1 = v_reinterpret_as_s16(v_up1);
-                v_int16x8 v_s1p2 = v_reinterpret_as_s16(v_up2);
-
-                // Load second row for 3x3 Sobel filter
-                v_um = v_load(&c_src[j-1]);
-                v_un = v_load(&c_src[j]);
-                v_up = v_load(&c_src[j+1]);
-                v_expand(v_um, v_um1, v_um2);
-                v_expand(v_un, v_un1, v_un2);
-                v_expand(v_up, v_up1, v_up2);
-                v_int16x8 v_s2m1 = v_reinterpret_as_s16(v_um1);
-                v_int16x8 v_s2m2 = v_reinterpret_as_s16(v_um2);
-                v_int16x8 v_s2n1 = v_reinterpret_as_s16(v_un1);
-                v_int16x8 v_s2n2 = v_reinterpret_as_s16(v_un2);
-                v_int16x8 v_s2p1 = v_reinterpret_as_s16(v_up1);
-                v_int16x8 v_s2p2 = v_reinterpret_as_s16(v_up2);
-
-                // Load third row for 3x3 Sobel filter
-                v_um = v_load(&n_src[j-1]);
-                v_un = v_load(&n_src[j]);
-                v_up = v_load(&n_src[j+1]);
-                v_expand(v_um, v_um1, v_um2);
-                v_expand(v_un, v_un1, v_un2);
-                v_expand(v_up, v_up1, v_up2);
-                v_int16x8 v_s3m1 = v_reinterpret_as_s16(v_um1);
-                v_int16x8 v_s3m2 = v_reinterpret_as_s16(v_um2);
-                v_int16x8 v_s3n1 = v_reinterpret_as_s16(v_un1);
-                v_int16x8 v_s3n2 = v_reinterpret_as_s16(v_un2);
-                v_int16x8 v_s3p1 = v_reinterpret_as_s16(v_up1);
-                v_int16x8 v_s3p2 = v_reinterpret_as_s16(v_up2);
-
-                // dx & dy for rows 1, 2, 3
-                v_int16x8 v_sdx1, v_sdy1;
-                spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1,
-                                                  v_s1m1, v_s1n1, v_s1p1,
-                                                  v_s2m1,         v_s2p1,
-                                                  v_s3m1, v_s3n1, v_s3p1 );
-
-                v_int16x8 v_sdx2, v_sdy2;
-                spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2,
-                                                  v_s1m2, v_s1n2, v_s1p2,
-                                                  v_s2m2,         v_s2p2,
+#if CV_SIMD
+    // Characters in variable names have the following meanings:
+    // u: unsigned char
+    // s: signed int
+    //
+    // [row][column]
+    // m: offset -1
+    // n: offset 0
+    // p: offset 1
+    // Example: umn is offset -1 in row and offset 0 in column
+    for ( i = 0; i < H - 1; i += 2 )
+    {
+        uchar *p_src = src.ptr<uchar>(i == 0 ? i_top : i - 1);
+        uchar *c_src = src.ptr<uchar>(i);
+        uchar *n_src = src.ptr<uchar>(i+1);
+        uchar *m_src = src.ptr<uchar>(i == H - 2 ? i_bottom : i + 2);
+
+        short *c_dx = dx.ptr<short>(i);
+        short *c_dy = dy.ptr<short>(i);
+        short *n_dx = dx.ptr<short>(i+1);
+        short *n_dy = dy.ptr<short>(i+1);
+
+        // Process rest of columns 16-column chunks at a time
+        for ( j = 1; j < W - v_uint8::nlanes; j += v_uint8::nlanes)
+        {
+            // Load top row for 3x3 Sobel filter
+            v_uint8 v_um = vx_load(&p_src[j-1]);
+            v_uint8 v_un = vx_load(&p_src[j]);
+            v_uint8 v_up = vx_load(&p_src[j+1]);
+            v_uint16 v_um1, v_um2, v_un1, v_un2, v_up1, v_up2;
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s1m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s1m2 = v_reinterpret_as_s16(v_um2);
+            v_int16 v_s1n1 = v_reinterpret_as_s16(v_un1);
+            v_int16 v_s1n2 = v_reinterpret_as_s16(v_un2);
+            v_int16 v_s1p1 = v_reinterpret_as_s16(v_up1);
+            v_int16 v_s1p2 = v_reinterpret_as_s16(v_up2);
+
+            // Load second row for 3x3 Sobel filter
+            v_um = vx_load(&c_src[j-1]);
+            v_un = vx_load(&c_src[j]);
+            v_up = vx_load(&c_src[j+1]);
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s2m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s2m2 = v_reinterpret_as_s16(v_um2);
+            v_int16 v_s2n1 = v_reinterpret_as_s16(v_un1);
+            v_int16 v_s2n2 = v_reinterpret_as_s16(v_un2);
+            v_int16 v_s2p1 = v_reinterpret_as_s16(v_up1);
+            v_int16 v_s2p2 = v_reinterpret_as_s16(v_up2);
+
+            // Load third row for 3x3 Sobel filter
+            v_um = vx_load(&n_src[j-1]);
+            v_un = vx_load(&n_src[j]);
+            v_up = vx_load(&n_src[j+1]);
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s3m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s3m2 = v_reinterpret_as_s16(v_um2);
+            v_int16 v_s3n1 = v_reinterpret_as_s16(v_un1);
+            v_int16 v_s3n2 = v_reinterpret_as_s16(v_un2);
+            v_int16 v_s3p1 = v_reinterpret_as_s16(v_up1);
+            v_int16 v_s3p2 = v_reinterpret_as_s16(v_up2);
+
+            // dx & dy for rows 1, 2, 3
+            v_int16 v_sdx1, v_sdy1;
+            spatialGradientKernel<v_int16>( v_sdx1, v_sdy1,
+                                            v_s1m1, v_s1n1, v_s1p1,
+                                            v_s2m1,         v_s2p1,
+                                            v_s3m1, v_s3n1, v_s3p1 );
+
+            v_int16 v_sdx2, v_sdy2;
+            spatialGradientKernel<v_int16>( v_sdx2, v_sdy2,
+                                            v_s1m2, v_s1n2, v_s1p2,
+                                            v_s2m2,         v_s2p2,
+                                            v_s3m2, v_s3n2, v_s3p2 );
+
+            // Store
+            v_store(&c_dx[j], v_sdx1);
+            v_store(&c_dx[j+v_int16::nlanes], v_sdx2);
+            v_store(&c_dy[j], v_sdy1);
+            v_store(&c_dy[j+v_int16::nlanes], v_sdy2);
+
+            // Load fourth row for 3x3 Sobel filter
+            v_um = vx_load(&m_src[j-1]);
+            v_un = vx_load(&m_src[j]);
+            v_up = vx_load(&m_src[j+1]);
+            v_expand(v_um, v_um1, v_um2);
+            v_expand(v_un, v_un1, v_un2);
+            v_expand(v_up, v_up1, v_up2);
+            v_int16 v_s4m1 = v_reinterpret_as_s16(v_um1);
+            v_int16 v_s4m2 = v_reinterpret_as_s16(v_um2);
v_s3m2, v_s3n2, v_s3p2 ); v_int16 v_s4n1 = v_reinterpret_as_s16(v_un1);
v_int16 v_s4n2 = v_reinterpret_as_s16(v_un2);
// Store v_int16 v_s4p1 = v_reinterpret_as_s16(v_up1);
v_store(&c_dx[j], v_sdx1); v_int16 v_s4p2 = v_reinterpret_as_s16(v_up2);
v_store(&c_dx[j+8], v_sdx2);
v_store(&c_dy[j], v_sdy1); // dx & dy for rows 2, 3, 4
v_store(&c_dy[j+8], v_sdy2); spatialGradientKernel<v_int16>( v_sdx1, v_sdy1,
v_s2m1, v_s2n1, v_s2p1,
// Load fourth row for 3x3 Sobel filter v_s3m1, v_s3p1,
v_um = v_load(&m_src[j-1]); v_s4m1, v_s4n1, v_s4p1 );
v_un = v_load(&m_src[j]);
v_up = v_load(&m_src[j+1]); spatialGradientKernel<v_int16>( v_sdx2, v_sdy2,
v_expand(v_um, v_um1, v_um2); v_s2m2, v_s2n2, v_s2p2,
v_expand(v_un, v_un1, v_un2); v_s3m2, v_s3p2,
v_expand(v_up, v_up1, v_up2); v_s4m2, v_s4n2, v_s4p2 );
v_int16x8 v_s4m1 = v_reinterpret_as_s16(v_um1);
v_int16x8 v_s4m2 = v_reinterpret_as_s16(v_um2); // Store
v_int16x8 v_s4n1 = v_reinterpret_as_s16(v_un1); v_store(&n_dx[j], v_sdx1);
v_int16x8 v_s4n2 = v_reinterpret_as_s16(v_un2); v_store(&n_dx[j+v_int16::nlanes], v_sdx2);
v_int16x8 v_s4p1 = v_reinterpret_as_s16(v_up1); v_store(&n_dy[j], v_sdy1);
v_int16x8 v_s4p2 = v_reinterpret_as_s16(v_up2); v_store(&n_dy[j+v_int16::nlanes], v_sdy2);
// dx & dy for rows 2, 3, 4
spatialGradientKernel<v_int16x8>( v_sdx1, v_sdy1,
v_s2m1, v_s2n1, v_s2p1,
v_s3m1, v_s3p1,
v_s4m1, v_s4n1, v_s4p1 );
spatialGradientKernel<v_int16x8>( v_sdx2, v_sdy2,
v_s2m2, v_s2n2, v_s2p2,
v_s3m2, v_s3p2,
v_s4m2, v_s4n2, v_s4p2 );
// Store
v_store(&n_dx[j], v_sdx1);
v_store(&n_dx[j+8], v_sdx2);
v_store(&n_dy[j], v_sdy1);
v_store(&n_dy[j+8], v_sdy2);
}
} }
} }
i_start = i; i_start = i;
@ -265,16 +251,12 @@ void spatialGradient( InputArray _src, OutputArray _dx, OutputArray _dy,
uchar v00, v01, v02, v10, v11, v12, v20, v21, v22; uchar v00, v01, v02, v10, v11, v12, v20, v21, v22;
for ( i = 0; i < H; i++ ) for ( i = 0; i < H; i++ )
{ {
if ( i == 0 ) p_src = src.ptr<uchar>(i_top); uchar *p_src = src.ptr<uchar>(i == 0 ? i_top : i - 1);
else p_src = src.ptr<uchar>(i-1); uchar *c_src = src.ptr<uchar>(i);
uchar *n_src = src.ptr<uchar>(i == H - 1 ? i_bottom : i + 1);
c_src = src.ptr<uchar>(i);
if ( i == H - 1 ) n_src = src.ptr<uchar>(i_bottom);
else n_src = src.ptr<uchar>(i+1);
c_dx = dx.ptr<short>(i); short *c_dx = dx.ptr<short>(i);
c_dy = dy.ptr<short>(i); short *c_dy = dy.ptr<short>(i);
// Process left-most column // Process left-most column
j = 0; j = 0;

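The spatialGradient hunk replaces the fixed 128-bit types (v_uint8x16, v_int16x8, v_load) with the width-agnostic universal intrinsics (v_uint8, v_int16, vx_load) and steps by ::nlanes, so the same source also covers wider SIMD registers under CV_SIMD. The arithmetic is unchanged: spatialGradientKernel produces the 3x3 Sobel derivatives for each pixel from its eight neighbours. A scalar reference of that computation, as a sketch with illustrative names rather than the library code:

// 3x3 Sobel derivatives at one pixel; m/n/p denote offsets -1/0/+1 in row, column.
static inline void sobel3x3_at(int v_mm, int v_mn, int v_mp,
                               int v_nm,           int v_np,
                               int v_pm, int v_pn, int v_pp,
                               short& dx, short& dy)
{
    dx = (short)((v_mp - v_mm) + 2 * (v_np - v_nm) + (v_pp - v_pm));  // [-1 0 1; -2 0 2; -1 0 1]
    dy = (short)((v_pm - v_mm) + 2 * (v_pn - v_mn) + (v_pp - v_mp));  // [-1 -2 -1; 0 0 0; 1 2 1]
}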
@ -2235,4 +2235,13 @@ TEST(Imgproc_Sobel, s16_regression_13506)
Sobel(src, dst, CV_16S, 0, 1, 5); Sobel(src, dst, CV_16S, 0, 1, 5);
ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF)); ASSERT_EQ(0.0, cvtest::norm(dst, ref, NORM_INF));
} }
TEST(Imgproc_Pyrdown, issue_12961)
{
Mat src(9, 9, CV_8UC1, Scalar::all(0));
Mat dst;
cv::pyrDown(src, dst);
ASSERT_EQ(0.0, cv::norm(dst));
}
}} // namespace }} // namespace

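The new Imgproc_Pyrdown test feeds a 9x9 image, for which pyrDown's default destination size rounds up to 5x5. A short sketch of why that shape exercises the unaligned store fixed above (the alignment remark is my reading of issue 12961, not something stated in the diff):

// pyrDown halves each dimension, rounding up: a 9x9 source yields a 5x5 result.
cv::Mat src(9, 9, CV_8UC1, cv::Scalar::all(0)), dst;
cv::pyrDown(src, dst);
CV_Assert(dst.rows == (src.rows + 1) / 2 && dst.cols == (src.cols + 1) / 2);  // 5x5
// A continuous 5-column CV_8UC1 Mat has a 5-byte row step, so odd rows start at
// addresses that are not 4-byte aligned; storing through a plain int* there is
// undefined behaviour, which the unaligned_int store in PyrDownVec_32s8u avoids.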
@ -341,6 +341,9 @@ EMSCRIPTEN_BINDINGS(binding_utils)
register_vector<cv::Mat>("MatVector"); register_vector<cv::Mat>("MatVector");
register_vector<cv::Rect>("RectVector"); register_vector<cv::Rect>("RectVector");
register_vector<cv::KeyPoint>("KeyPointVector"); register_vector<cv::KeyPoint>("KeyPointVector");
register_vector<cv::DMatch>("DMatchVector");
register_vector<std::vector<cv::DMatch>>("DMatchVectorVector");
emscripten::class_<cv::Mat>("Mat") emscripten::class_<cv::Mat>("Mat")
.constructor<>() .constructor<>()
@ -494,6 +497,12 @@ EMSCRIPTEN_BINDINGS(binding_utils)
.field("response", &cv::KeyPoint::response) .field("response", &cv::KeyPoint::response)
.field("size", &cv::KeyPoint::size); .field("size", &cv::KeyPoint::size);
emscripten::value_object<cv::DMatch>("DMatch")
.field("queryIdx", &cv::DMatch::queryIdx)
.field("trainIdx", &cv::DMatch::trainIdx)
.field("imgIdx", &cv::DMatch::imgIdx)
.field("distance", &cv::DMatch::distance);
emscripten::value_array<cv::Scalar_<double>> ("Scalar") emscripten::value_array<cv::Scalar_<double>> ("Scalar")
.element(index<0>()) .element(index<0>())
.element(index<1>()) .element(index<1>())

@ -200,20 +200,19 @@ public:
{ {
int j; int j;
calc_non_rbf_base( vcount, var_count, vecs, another, results, calc_non_rbf_base( vcount, var_count, vecs, another, results,
-2*params.gamma, -2*params.coef0 ); 2*params.gamma, 2*params.coef0 );
// TODO: speedup this // TODO: speedup this
for( j = 0; j < vcount; j++ ) for( j = 0; j < vcount; j++ )
{ {
Qfloat t = results[j]; Qfloat t = results[j];
Qfloat e = std::exp(-std::abs(t)); Qfloat e = std::exp(std::abs(t));
if( t > 0 ) if( t > 0 )
results[j] = (Qfloat)((1. - e)/(1. + e));
else
results[j] = (Qfloat)((e - 1.)/(e + 1.)); results[j] = (Qfloat)((e - 1.)/(e + 1.));
else
results[j] = (Qfloat)((1. - e)/(1. + e));
} }
} }
void calc_rbf( int vcount, int var_count, const float* vecs, void calc_rbf( int vcount, int var_count, const float* vecs,
const float* another, Qfloat* results ) const float* another, Qfloat* results )
{ {
@ -1310,8 +1309,6 @@ public:
if( kernelType != SIGMOID && kernelType != POLY ) if( kernelType != SIGMOID && kernelType != POLY )
params.coef0 = 0; params.coef0 = 0;
else if( params.coef0 < 0 )
CV_Error( CV_StsOutOfRange, "The kernel parameter <coef0> must be positive or zero" );
if( kernelType != POLY ) if( kernelType != POLY )
params.degree = 0; params.degree = 0;

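The calc_sigmoid rewrite evaluates the SVM sigmoid kernel K(u, v) = tanh(gamma * <u, v> + coef0). calc_non_rbf_base now receives 2*gamma and 2*coef0, so t is twice the kernel argument and tanh(t/2) is obtained from the identity tanh(t/2) = (exp(t) - 1) / (exp(t) + 1), with the exponent's sign chosen from the sign of t; the old code negated gamma and coef0 and paired the branches the other way round, which worked out to the negated kernel value. Dropping the coef0 >= 0 restriction in the same file fits this change, since a sigmoid kernel with negative coef0 (as in the new trainauto_sigmoid test) is legitimate. A standalone sketch of the value being computed, for reference only:

#include <cmath>

// Reference form of the sigmoid (tanh) kernel the loop above evaluates.
static float sigmoid_kernel(const float* u, const float* v, int n,
                            float gamma, float coef0)
{
    double dot = 0.0;
    for (int k = 0; k < n; ++k)
        dot += (double)u[k] * v[k];
    return (float)std::tanh(gamma * dot + coef0);
}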
@ -88,6 +88,51 @@ void CV_SVMTrainAutoTest::run( int /*start_from*/ )
TEST(ML_SVM, trainauto) { CV_SVMTrainAutoTest test; test.safe_run(); } TEST(ML_SVM, trainauto) { CV_SVMTrainAutoTest test; test.safe_run(); }
TEST(ML_SVM, trainauto_sigmoid)
{
const int datasize = 100;
cv::Mat samples = cv::Mat::zeros( datasize, 2, CV_32FC1 );
cv::Mat responses = cv::Mat::zeros( datasize, 1, CV_32S );
const float scale_factor = 0.5;
const float radius = 2.0;
// Populate samples with data that can be split into two concentric circles
for (int i = 0; i < datasize; i+=2)
{
const float pi = 3.14159f;
const float angle_rads = (i/datasize) * pi;
const float x = radius * cos(angle_rads);
const float y = radius * cos(angle_rads);
// Larger circle
samples.at<float>( i, 0 ) = x;
samples.at<float>( i, 1 ) = y;
responses.at<int>( i, 0 ) = 0;
// Smaller circle
samples.at<float>( i + 1, 0 ) = x * scale_factor;
samples.at<float>( i + 1, 1 ) = y * scale_factor;
responses.at<int>( i + 1, 0 ) = 1;
}
cv::Ptr<TrainData> data = TrainData::create( samples, cv::ml::ROW_SAMPLE, responses );
cv::Ptr<SVM> svm = SVM::create();
svm->setKernel(SVM::SIGMOID);
svm->setGamma(10.0);
svm->setCoef0(-10.0);
svm->trainAuto( data, 10 ); // 2-fold cross validation.
float test_data0[2] = {radius, radius};
cv::Mat test_point0 = cv::Mat( 1, 2, CV_32FC1, test_data0 );
ASSERT_EQ(0, svm->predict( test_point0 ));
float test_data1[2] = {scale_factor * radius, scale_factor * radius};
cv::Mat test_point1 = cv::Mat( 1, 2, CV_32FC1, test_data1 );
ASSERT_EQ(1, svm->predict( test_point1 ));
}
TEST(ML_SVM, trainAuto_regression_5369) TEST(ML_SVM, trainAuto_regression_5369)
{ {

@ -323,7 +323,7 @@ def writeTextGraph(modelPath, outputPath, outNodes):
for node in graph_def.node: for node in graph_def.node:
if node.op == 'Const': if node.op == 'Const':
if 'value' in node.attr: if 'value' in node.attr and node.attr['value'].tensor.tensor_content:
del node.attr['value'] node.attr['value'].tensor.tensor_content = ''
tf.train.write_graph(graph_def, "", outputPath, as_text=True) tf.train.write_graph(graph_def, "", outputPath, as_text=True)
