diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 8d9c639232..5284e4d4a7 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -22,6 +22,7 @@ class BatchNormLayerImpl : public BatchNormLayer { public: Mat weights_, bias_; + Mat weightMat, biasMat; BatchNormLayerImpl(const LayerParams& params) { @@ -96,17 +97,81 @@ public: return true; } + void finalize(const std::vector &inputs, std::vector &outputs) + { + if (inputs[0]->dims == 4) + { + int groups = inputs[0]->size[0]; + int channels = inputs[0]->size[1]; + int rows = inputs[0]->size[2]; + int cols = inputs[0]->size[3]; + MatShape s = shape(groups * channels, rows * cols); + weightMat = Mat(s[0], s[1], CV_32FC1); + biasMat = Mat(s[0], s[1], CV_32FC1); + for (int n = 0; n < s[0]; n++) + { + weightMat.row(n).setTo(weights_.at(n % channels)); + biasMat.row(n).setTo(bias_.at(n % channels)); + } + } + } + virtual bool supportBackend(int backendId) { return backendId == DNN_BACKEND_DEFAULT || backendId == DNN_BACKEND_HALIDE && haveHalide(); } +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + std::vector inputs; + std::vector outputs; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + CV_Assert(blobs.size() >= 2); + CV_Assert(inputs.size() == 1); + + UMat &inpBlob = inputs[0]; + CV_Assert(inpBlob.dims == 2 || inpBlob.dims == 4); + int groups = inpBlob.size[0]; + int channels = inpBlob.size[1]; + int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1; + int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1; + + for (size_t ii = 0; ii < outputs.size(); ii++) + { + if (inpBlob.dims == 2) + { + UMat& src = inputs[ii]; + UMat& dst = outputs[ii]; + multiply(src, weights_, dst); + add(dst, bias_, dst); + } + else + { + MatShape s = shape(groups * channels, rows * cols); + UMat src = inputs[ii].reshape(1, s.size(), &s[0]); + UMat dst = outputs[ii].reshape(1, s.size(), &s[0]); + multiply(src, weightMat, dst); + add(dst, biasMat, dst); + } + } + return true; + } +#endif + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); } diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index af2bfeb6d8..c5568b0d58 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -63,8 +63,22 @@ public: } #ifdef HAVE_OPENCL - bool forward_ocl(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) { + std::vector inputs; + std::vector outputs; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + for (int i = 0, n = outputs.size(); i < n; ++i) + { + void *src_handle = inputs[i].handle(ACCESS_READ); + void *dst_handle = outputs[i].handle(ACCESS_WRITE); + if (src_handle != dst_handle) + inputs[i].copyTo(outputs[i]); + } + return true; } #endif diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 7e2214ea46..40375734d8 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -259,11 +259,63 @@ public: } }; +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + std::vector inputs; + std::vector outputs; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + switch (op) + { + case SUM: + if (coeffs.empty()) + { + add(inputs[0], inputs[1], outputs[0]); + for (int i = 2; i < inputs.size(); ++i) + add(outputs[0], inputs[i], outputs[0]); + } + else + { + UMat mul0, mul1; + multiply(coeffs[0], inputs[0], mul0); + multiply(coeffs[1], inputs[1], mul1); + add(mul0, mul1, outputs[0]); + for (int i = 2; i < inputs.size(); ++i) + { + multiply(coeffs[i], inputs[i], mul0); + add(mul0, outputs[0], outputs[0]); + } + } + break; + case PROD: + multiply(inputs[0], inputs[1], outputs[0]); + for (int i = 2; i < inputs.size(); ++i) + multiply(inputs[i], outputs[0], outputs[0]); + break; + case MAX: + max(inputs[0], inputs[1], outputs[0]); + for (int i = 2; i < inputs.size(); ++i) + max(inputs[i], outputs[0], outputs[0]); + break; + default: + return false; + } + return true; + } +#endif + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); } diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index 9d2940d6da..43445890f0 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -69,11 +69,74 @@ public: return true; } +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + std::vector inputs; + std::vector outputs; + std::vector internals; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + internals_.getUMatVector(internals); + + CV_Assert(inputs.size() == 1 && outputs.size() == 1); + CV_Assert(inputs[0].total() == outputs[0].total()); + + const UMat& inp0 = inputs[0]; + UMat& buffer = internals[0]; + size_t num = inp0.size[0]; + size_t channels = inp0.size[1]; + size_t channelSize = inp0.total() / (num * channels); + for (size_t i = 0; i < num; ++i) + { + MatShape s = shape(channels, channelSize); + UMat src = inputs[i].reshape(1, s.size(), &s[0]); + UMat dst = outputs[i].reshape(1, s.size(), &s[0]); + + UMat abs_mat; + absdiff(src, cv::Scalar::all(0), abs_mat); + pow(abs_mat, pnorm, buffer); + + if (acrossSpatial) + { + // add eps to avoid overflow + float absSum = sum(buffer)[0] + epsilon; + float norm = pow(absSum, 1.0f / pnorm); + multiply(src, 1.0f / norm, dst); + } + + if (!blobs.empty()) + { + // scale the output + Mat scale = blobs[0]; + if (scale.total() == 1) + { + // _scale: 1 x 1 + multiply(dst, scale.at(0, 0), dst); + } + else + { + // _scale: _channels x 1 + CV_Assert(scale.total() == channels); + repeat(scale, 1, dst.cols, buffer); + multiply(dst, buffer, dst); + } + } + } + return true; + } +#endif + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index a966e84bd1..cd23541aed 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -320,6 +320,11 @@ TEST(Layer_Test_Eltwise, Accuracy) testLayerUsingCaffeModels("layer_eltwise"); } +OCL_TEST(Layer_Test_Eltwise, Accuracy) +{ + testLayerUsingCaffeModels("layer_eltwise", DNN_TARGET_OPENCL); +} + TEST(Layer_Test_PReLU, Accuracy) { testLayerUsingCaffeModels("layer_prelu", DNN_TARGET_CPU, true); diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 2cad882b5a..bde5760bfc 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -76,7 +76,7 @@ static std::string path(const std::string& file) return findDataFile("dnn/tensorflow/" + file, false); } -static void runTensorFlowNet(const std::string& prefix, bool hasText = false, +static void runTensorFlowNet(const std::string& prefix, int targetId = DNN_TARGET_CPU, bool hasText = false, double l1 = 1e-5, double lInf = 1e-4, bool memoryLoad = false) { @@ -104,6 +104,9 @@ static void runTensorFlowNet(const std::string& prefix, bool hasText = false, ASSERT_FALSE(net.empty()); + net.setPreferableBackend(DNN_BACKEND_DEFAULT); + net.setPreferableTarget(targetId); + cv::Mat input = blobFromNPY(inpPath); cv::Mat target = blobFromNPY(outPath); @@ -132,6 +135,11 @@ TEST(Test_TensorFlow, eltwise_add_mul) runTensorFlowNet("eltwise_add_mul"); } +OCL_TEST(Test_TensorFlow, eltwise_add_mul) +{ + runTensorFlowNet("eltwise_add_mul", DNN_TARGET_OPENCL); +} + TEST(Test_TensorFlow, pad_and_concat) { runTensorFlowNet("pad_and_concat"); @@ -141,7 +149,14 @@ TEST(Test_TensorFlow, batch_norm) { runTensorFlowNet("batch_norm"); runTensorFlowNet("fused_batch_norm"); - runTensorFlowNet("batch_norm_text", true); + runTensorFlowNet("batch_norm_text", DNN_TARGET_CPU, true); +} + +OCL_TEST(Test_TensorFlow, batch_norm) +{ + runTensorFlowNet("batch_norm", DNN_TARGET_OPENCL); + runTensorFlowNet("fused_batch_norm", DNN_TARGET_OPENCL); + runTensorFlowNet("batch_norm_text", DNN_TARGET_OPENCL, true); } TEST(Test_TensorFlow, pooling) @@ -179,15 +194,15 @@ TEST(Test_TensorFlow, fp16) { const float l1 = 1e-3; const float lInf = 1e-2; - runTensorFlowNet("fp16_single_conv", false, l1, lInf); - runTensorFlowNet("fp16_deconvolution", false, l1, lInf); - runTensorFlowNet("fp16_max_pool_odd_same", false, l1, lInf); - runTensorFlowNet("fp16_padding_valid", false, l1, lInf); - runTensorFlowNet("fp16_eltwise_add_mul", false, l1, lInf); - runTensorFlowNet("fp16_max_pool_odd_valid", false, l1, lInf); - runTensorFlowNet("fp16_pad_and_concat", false, l1, lInf); - runTensorFlowNet("fp16_max_pool_even", false, l1, lInf); - runTensorFlowNet("fp16_padding_same", false, l1, lInf); + runTensorFlowNet("fp16_single_conv", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_deconvolution", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_max_pool_odd_same", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_padding_valid", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_eltwise_add_mul", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_max_pool_odd_valid", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_pad_and_concat", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_max_pool_even", DNN_TARGET_CPU, false, l1, lInf); + runTensorFlowNet("fp16_padding_same", DNN_TARGET_CPU, false, l1, lInf); } TEST(Test_TensorFlow, quantized) @@ -267,7 +282,7 @@ OCL_TEST(Test_TensorFlow, MobileNet_SSD) TEST(Test_TensorFlow, lstm) { - runTensorFlowNet("lstm", true); + runTensorFlowNet("lstm", DNN_TARGET_CPU, true); } TEST(Test_TensorFlow, split) @@ -284,11 +299,11 @@ TEST(Test_TensorFlow, memory_read) { double l1 = 1e-5; double lInf = 1e-4; - runTensorFlowNet("lstm", true, l1, lInf, true); + runTensorFlowNet("lstm", DNN_TARGET_CPU, true, l1, lInf, true); - runTensorFlowNet("batch_norm", false, l1, lInf, true); - runTensorFlowNet("fused_batch_norm", false, l1, lInf, true); - runTensorFlowNet("batch_norm_text", true, l1, lInf, true); + runTensorFlowNet("batch_norm", DNN_TARGET_CPU, false, l1, lInf, true); + runTensorFlowNet("fused_batch_norm", DNN_TARGET_CPU, false, l1, lInf, true); + runTensorFlowNet("batch_norm_text", DNN_TARGET_CPU, true, l1, lInf, true); } } diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index cbac6c5525..f7471dd144 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -170,6 +170,11 @@ TEST(Torch_Importer, run_batch_norm) runTorchNet("net_batch_norm", DNN_TARGET_CPU, "", false, true); } +OCL_TEST(Torch_Importer, run_batch_norm) +{ + runTorchNet("net_batch_norm", DNN_TARGET_OPENCL, "", false, true); +} + TEST(Torch_Importer, net_prelu) { runTorchNet("net_prelu"); @@ -225,6 +230,11 @@ TEST(Torch_Importer, net_normalize) runTorchNet("net_normalize", DNN_TARGET_CPU, "", false, true); } +OCL_TEST(Torch_Importer, net_normalize) +{ + runTorchNet("net_normalize", DNN_TARGET_OPENCL, "", false, true); +} + TEST(Torch_Importer, net_padding) { runTorchNet("net_padding", DNN_TARGET_CPU, "", false, true); @@ -237,6 +247,11 @@ TEST(Torch_Importer, net_non_spatial) runTorchNet("net_non_spatial", DNN_TARGET_CPU, "", false, true); } +OCL_TEST(Torch_Importer, net_non_spatial) +{ + runTorchNet("net_non_spatial", DNN_TARGET_OPENCL, "", false, true); +} + TEST(Torch_Importer, ENet_accuracy) { Net net;