From dc93eedecfa7b6a6469ea4282b708435ee356ae0 Mon Sep 17 00:00:00 2001
From: dkurt
Date: Wed, 14 Jun 2017 16:52:08 +0300
Subject: [PATCH] Added Halide backend support for deep learning layers

---
 modules/dnn/include/opencv2/dnn/dnn.hpp       | 127 ++++
 modules/dnn/perf/perf_halide_net.cpp          | 141 ++++
 modules/dnn/perf/perf_main.cpp                |  11 +-
 modules/dnn/samples/squeezenet_halide.cpp     | 126 ++++
 modules/dnn/src/dnn.cpp                       | 301 ++++++++-
 modules/dnn/src/halide_scheduler.cpp          | 260 +++++++
 modules/dnn/src/halide_scheduler.hpp          |  37 +
 modules/dnn/src/layers/batch_norm_layer.cpp   |  74 ++
 modules/dnn/src/layers/concat_layer.cpp       |  53 ++
 modules/dnn/src/layers/convolution_layer.cpp  | 151 +++++
 modules/dnn/src/layers/elementwise_layers.cpp | 112 +++
 modules/dnn/src/layers/eltwise_layer.cpp      |  77 +++
 .../dnn/src/layers/fully_connected_layer.cpp  |  56 ++
 modules/dnn/src/layers/lrn_layer.cpp          |  74 ++
 .../dnn/src/layers/max_unpooling_layer.cpp    |  56 ++
 modules/dnn/src/layers/pooling_layer.cpp      | 133 ++++
 modules/dnn/src/layers/scale_layer.cpp        |  59 ++
 modules/dnn/src/layers/softmax_layer.cpp      |  59 ++
 modules/dnn/src/layers/split_layer.cpp        |   3 +-
 modules/dnn/src/op_halide.cpp                 | 172 +++++
 modules/dnn/src/op_halide.hpp                 |  82 +++
 modules/dnn/src/tensorflow/tf_importer.cpp    |   2 -
 modules/dnn/test/test_halide_layers.cpp       | 637 ++++++++++++++++++
 modules/dnn/test/test_halide_nets.cpp         | 124 ++++
 .../tutorials/tutorial_dnn_halide.markdown    | 135 ++++
 .../tutorial_dnn_halide_scheduling.markdown   |  83 +++
 26 files changed, 3138 insertions(+), 7 deletions(-)
 create mode 100644 modules/dnn/perf/perf_halide_net.cpp
 create mode 100644 modules/dnn/samples/squeezenet_halide.cpp
 create mode 100644 modules/dnn/src/halide_scheduler.cpp
 create mode 100644 modules/dnn/src/halide_scheduler.hpp
 create mode 100644 modules/dnn/src/op_halide.cpp
 create mode 100644 modules/dnn/src/op_halide.hpp
 create mode 100644 modules/dnn/test/test_halide_layers.cpp
 create mode 100644 modules/dnn/test/test_halide_nets.cpp
 create mode 100644 modules/dnn/tutorials/tutorial_dnn_halide.markdown
 create mode 100644 modules/dnn/tutorials/tutorial_dnn_halide_scheduling.markdown

diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 384bcb530..50df76491 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -55,6 +55,23 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     typedef std::vector<int> MatShape;
 
+    /**
+     * @brief Enum of computation backends supported by layers.
+     */
+    enum Backend
+    {
+        DNN_BACKEND_DEFAULT,
+        DNN_BACKEND_HALIDE
+    };
+
+    /**
+     * @brief Enum of target devices for computations.
+     */
+    enum Target
+    {
+        DNN_TARGET_CPU
+    };
+
     /** @brief Initialize dnn module and built-in layers.
      *
      * This function automatically called on most of OpenCV builds,
@@ -77,6 +94,54 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         String type; //!< Type name which was used for creating layer by layer factory (optional).
     };
 
+    /**
+     * @brief Derivatives of this class encapsulate functions of certain backends.
+     */
+    class BackendNode
+    {
+    public:
+        BackendNode(int backendId);
+
+        virtual ~BackendNode(); //!< Virtual destructor to enable polymorphism.
+
+        int backendId; //!< Backend identifier.
+    };
+
+    /**
+     * @brief Derivatives of this class wrap cv::Mat for different backends and targets.
+     */
+    class BackendWrapper
+    {
+    public:
+        BackendWrapper(int backendId, int targetId);
+
+        /**
+         * @brief Wrap cv::Mat for a specific backend and target.
+         * @param[in] targetId Target identifier.
+         * @param[in] m cv::Mat for wrapping.
+         *
+         * Makes a CPU->GPU data transfer if it's required for the target.
+         */
+        BackendWrapper(int targetId, const cv::Mat& m);
+
+        /**
+         * @brief Make wrapper for reused cv::Mat.
+         * @param[in] base Wrapper of cv::Mat that will be reused.
+         * @param[in] shape Specific shape.
+         *
+         * Initialize wrapper from another one. It'll wrap the same host CPU
+         * memory but mustn't allocate additional memory on the device (i.e. GPU).
+         * It might have a different shape. Use it when CPU memory is reused, so
+         * that the associated device memory is reused too.
+         */
+        BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape);
+
+        virtual ~BackendWrapper(); //!< Virtual destructor to enable polymorphism.
+
+        int backendId; //!< Backend identifier.
+        int targetId;  //!< Target identifier.
+    };
+
     /** @brief This interface class allows to build new Layers - are building blocks of networks.
      *
      * Each class, derived from Layer, must implement allocate() methods to declare own outputs and forward() to compute outputs.
@@ -131,6 +196,50 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
          */
         virtual int outputNameToIndex(String outputName);
 
+        /**
+         * @brief Ask layer if it supports a specific backend for doing computations.
+         * @param[in] backendId computation backend identifier.
+         * @see Backend
+         */
+        virtual bool supportBackend(int backendId);
+
+        /**
+         * @brief Returns Halide backend node.
+         * @param[in] inputs Input Halide buffers.
+         * @see BackendNode, BackendWrapper
+         *
+         * Input buffers should be exactly the same ones that will be used in
+         * forward invocations. Although we could use Halide::ImageParam based
+         * on the input shape only, this helps prevent some memory management
+         * issues (if something goes wrong, Halide tests will fail).
+         */
+        virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs);
+
+        /**
+         * @brief Automatic Halide scheduling based on layer hyper-parameters.
+         * @param[in] node Backend node with Halide functions.
+         * @param[in] inputs Blobs that will be used in forward invocations.
+         * @param[in] outputs Blobs that will be used in forward invocations.
+         * @see BackendNode
+         *
+         * Layers don't use their own Halide::Func members because layers fusing
+         * may have been applied, in which case the fused function should be scheduled.
+         */
+        virtual void applyHalideScheduler(Ptr<BackendNode>& node,
+                                          const std::vector<Mat*> &inputs,
+                                          const std::vector<Mat> &outputs) const;
+
+        /**
+         * @brief Implement layers fusing.
+         * @param[in] node Backend node of bottom layer.
+         * @see BackendNode
+         *
+         * Relevant for graph-based backends. If the layer is attached successfully,
+         * returns a non-empty cv::Ptr to a node of the same backend.
+         * Fuse only over the last function.
+         */
+        virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node);
+
         virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                      const int requiredOutputs,
                                      std::vector<MatShape> &outputs,
@@ -251,6 +360,24 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         /** @overload */
         void forwardOpt(const std::vector<LayerId> &toLayers);
 
+        /**
+         * @brief Compile Halide layers.
+         * @param[in] scheduler Path to YAML file with scheduling directives.
+         * @see setPreferableBackend
+         *
+         * Schedules layers that support the Halide backend, then compiles them
+         * for a specific target. For layers that are not represented in the
+         * scheduling file, or if no manual scheduling is used at all, automatic
+         * scheduling is applied.
+         */
+        void compileHalide(const std::string& scheduler = "");
+
+        /**
+         * @brief Ask network to use a specific computation backend where it's supported.
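+         *
+         * A minimal usage sketch (file names here are placeholders, and the
+         * input blob is assumed to be prepared as in the SqueezeNet sample):
+         * @code
+         * Net net = readNetFromCaffe("model.prototxt", "model.caffemodel");
+         * net.setBlob("", inputBlob);                   // inputBlob from blobFromImage()
+         * net.setPreferableBackend(DNN_BACKEND_HALIDE); // prefer Halide where supported
+         * net.compileHalide();                          // optional: forward() compiles if needed
+         * net.forward();
+         * Mat out = net.getBlob("prob");                // "prob" names the output layer
+         * @endcode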
+ * @param[in] backendId backend identifier. + * @see Backend + */ + void setPreferableBackend(int backendId); + /** @brief Sets the new value for the layer output blob * @param outputName descriptor of the updating layer output blob. * @param blob new blob. diff --git a/modules/dnn/perf/perf_halide_net.cpp b/modules/dnn/perf/perf_halide_net.cpp new file mode 100644 index 000000000..ffb3c067f --- /dev/null +++ b/modules/dnn/perf/perf_halide_net.cpp @@ -0,0 +1,141 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. + +namespace cvtest +{ + +#ifdef HAVE_HALIDE +using namespace cv; +using namespace dnn; + +static void loadNet(const std::string& weights, const std::string& proto, + const std::string& scheduler, int inWidth, int inHeight, + const std::string& outputLayer, const std::string& framework, + int targetId, Net* net, int* outputLayerId) +{ + Mat input(inHeight, inWidth, CV_32FC3); + randu(input, 0.0f, 1.0f); + + if (framework == "caffe") + { + *net = cv::dnn::readNetFromCaffe(proto, weights); + } + else if (framework == "torch") + { + *net = cv::dnn::readNetFromTorch(weights); + } + else if (framework == "tensorflow") + { + *net = cv::dnn::readNetFromTensorflow(weights); + } + else + CV_Error(Error::StsNotImplemented, "Unknown framework " + framework); + + net->setBlob("", cv::dnn::blobFromImage(input, 1.0, false)); + net->setPreferableBackend(DNN_BACKEND_HALIDE); + net->compileHalide(scheduler); + *outputLayerId = net->getLayerId(outputLayer); + net->forward(*outputLayerId); +} + +PERF_TEST(GoogLeNet, HalidePerfTest) +{ + Net net; + int outputLayerId; + loadNet(findDataFile("dnn/bvlc_googlenet.caffemodel"), + findDataFile("dnn/bvlc_googlenet.prototxt"), + "", 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net, &outputLayerId); + + TEST_CYCLE_N(10) + { + net.forward(outputLayerId); + } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(AlexNet, HalidePerfTest) +{ + Net net; + int outputLayerId; + loadNet(findDataFile("dnn/bvlc_alexnet.caffemodel"), + findDataFile("dnn/bvlc_alexnet.prototxt"), + findDataFile("dnn/halide_scheduler_alexnet.yml"), + 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net, &outputLayerId); + + TEST_CYCLE_N(10) + { + net.forward(outputLayerId); + } + SANITY_CHECK_NOTHING(); +} + +// PERF_TEST(ResNet50, HalidePerfTest) +// { +// Net net; +// int outputLayerId; +// loadNet(findDataFile("dnn/ResNet-50-model.caffemodel"), +// findDataFile("dnn/ResNet-50-deploy.prototxt"), +// findDataFile("dnn/halide_scheduler_resnet_50.yml"), +// 224, 224, "prob", "caffe", DNN_TARGET_CPU, &net, &outputLayerId); +// +// TEST_CYCLE_N(10) +// { +// net.forward(outputLayerId); +// } +// SANITY_CHECK_NOTHING(); +// } + +// PERF_TEST(SqueezeNet_v1_1, HalidePerfTest) +// { +// Net net; +// int outputLayerId; +// loadNet(findDataFile("dnn/squeezenet_v1_1.caffemodel"), +// findDataFile("dnn/squeezenet_v1_1.prototxt"), +// findDataFile("dnn/halide_scheduler_squeezenet_v1_1.yml"), +// 227, 227, "prob", "caffe", DNN_TARGET_CPU, &net, &outputLayerId); +// +// TEST_CYCLE_N(10) +// { +// net.forward(outputLayerId); +// } +// SANITY_CHECK_NOTHING(); +// } + +PERF_TEST(Inception_5h, HalidePerfTest) +{ + Net net; + int outputLayerId; + loadNet(findDataFile("dnn/tensorflow_inception_graph.pb"), "", + 
findDataFile("dnn/halide_scheduler_inception_5h.yml"), + 224, 224, "softmax2", "tensorflow", DNN_TARGET_CPU, + &net, &outputLayerId); + + TEST_CYCLE_N(10) + { + net.forward(outputLayerId); + } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(ENet, HalidePerfTest) +{ + Net net; + int outputLayerId; + loadNet(findDataFile("dnn/Enet-model-best.net"), "", + findDataFile("dnn/halide_scheduler_enet.yml"), + 512, 256, "l367_Deconvolution", "torch", DNN_TARGET_CPU, + &net, &outputLayerId); + + TEST_CYCLE_N(10) + { + net.forward(outputLayerId); + } + SANITY_CHECK_NOTHING(); +} +#endif // HAVE_HALIDE + +} // namespace cvtest diff --git a/modules/dnn/perf/perf_main.cpp b/modules/dnn/perf/perf_main.cpp index 17980204b..d66f19c9d 100644 --- a/modules/dnn/perf/perf_main.cpp +++ b/modules/dnn/perf/perf_main.cpp @@ -1,3 +1,12 @@ #include "perf_precomp.hpp" -CV_PERF_TEST_MAIN(dnn) +static const char* extraTestDataPath = +#ifdef WINRT + NULL; +#else + getenv("OPENCV_DNN_TEST_DATA_PATH"); +#endif + +CV_PERF_TEST_MAIN(dnn, + extraTestDataPath ? (void)cvtest::addDataSearchPath(extraTestDataPath) : (void)0 +) diff --git a/modules/dnn/samples/squeezenet_halide.cpp b/modules/dnn/samples/squeezenet_halide.cpp new file mode 100644 index 000000000..fd94c601a --- /dev/null +++ b/modules/dnn/samples/squeezenet_halide.cpp @@ -0,0 +1,126 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. + +// Sample of using Halide backend in OpenCV deep learning module. +// Based on dnn/samples/caffe_googlenet.cpp. + +#include +#include +#include +using namespace cv; +using namespace cv::dnn; + +#include +#include +#include + +/* Find best class for the blob (i. e. class with maximal probability) */ +void getMaxClass(const Mat &probBlob, int *classId, double *classProb) +{ + Mat probMat = probBlob.reshape(1, 1); //reshape the blob to 1x1000 matrix + Point classNumber; + + minMaxLoc(probMat, NULL, classProb, NULL, &classNumber); + *classId = classNumber.x; +} + +std::vector readClassNames(const char *filename = "synset_words.txt") +{ + std::vector classNames; + + std::ifstream fp(filename); + if (!fp.is_open()) + { + std::cerr << "File with classes labels not found: " << filename << std::endl; + exit(-1); + } + + std::string name; + while (!fp.eof()) + { + std::getline(fp, name); + if (name.length()) + classNames.push_back( name.substr(name.find(' ')+1) ); + } + + fp.close(); + return classNames; +} + +int main(int argc, char **argv) +{ + initModule(); // Required if OpenCV is built as static libs. + + std::string modelTxt = "train_val.prototxt"; + std::string modelBin = "squeezenet_v1.1.caffemodel"; + std::string imageFile = (argc > 1) ? argv[1] : "space_shuttle.jpg"; + + //! [Read and initialize network] + Net net = dnn::readNetFromCaffe(modelTxt, modelBin); + //! [Read and initialize network] + + //! [Check that network was read successfully] + if (net.empty()) + { + std::cerr << "Can't load network by using the following files: " << std::endl; + std::cerr << "prototxt: " << modelTxt << std::endl; + std::cerr << "caffemodel: " << modelBin << std::endl; + std::cerr << "SqueezeNet v1.1 can be downloaded from:" << std::endl; + std::cerr << "https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1" << std::endl; + exit(-1); + } + //! 
[Check that network was read successfully]
+
+    //! [Prepare blob]
+    Mat img = imread(imageFile);
+    if (img.empty())
+    {
+        std::cerr << "Can't read image from the file: " << imageFile << std::endl;
+        exit(-1);
+    }
+    if (img.channels() != 3)
+    {
+        std::cerr << "Image " << imageFile << " isn't 3-channel" << std::endl;
+        exit(-1);
+    }
+
+    resize(img, img, Size(227, 227));                // SqueezeNet v1.1 predicts classes from a 3x227x227 input image.
+    Mat inputBlob = blobFromImage(img, 1.0, false);  // Convert Mat to 4-dimensional batch.
+    //! [Prepare blob]
+
+    //! [Set input blob]
+    net.setBlob("", inputBlob);  // Set the network input.
+    //! [Set input blob]
+
+    //! [Enable Halide backend]
+    net.setPreferableBackend(DNN_BACKEND_HALIDE);  // Tell the engine to use Halide where possible.
+    //! [Enable Halide backend]
+
+    //! [Compile Halide pipeline]
+    net.compileHalide();  // Compile Halide pipeline.
+    //! [Compile Halide pipeline]
+
+    //! [Make forward pass]
+    net.forward();  // Compute output.
+    //! [Make forward pass]
+
+    //! [Gather output]
+    Mat prob = net.getBlob("prob");  // Gather output of "prob" layer.
+
+    int classId;
+    double classProb;
+    getMaxClass(prob, &classId, &classProb);  // Find the best class.
+    //! [Gather output]
+
+    //! [Print results]
+    std::vector<std::string> classNames = readClassNames();
+    std::cout << "Best class: #" << classId << " '" << classNames.at(classId) << "'" << std::endl;
+    std::cout << "Probability: " << classProb * 100 << "%" << std::endl;
+    //! [Print results]
+
+    return 0;
+} //main
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index cecf5aa95..597ca9ff2 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -40,6 +40,8 @@
 #include "precomp.hpp"
+#include "op_halide.hpp"
+#include "halide_scheduler.hpp"
 #include
 #include
 #include
@@ -177,6 +179,121 @@ struct LayerPin
     }
 };
 
+// Objects of this class manage wrappers. For every CPU memory pointer and shape
+// there is one and only one wrapper. For now it supports wrapping for a single
+// backend and target.
+class BackendWrapManager
+{
+public:
+    Ptr<BackendWrapper> wrap(const Mat& m, int backendId, int targetId = DNN_TARGET_CPU)
+    {
+        CV_Assert(backendId != DNN_BACKEND_DEFAULT);
+
+        std::map<void*, Ptr<BackendWrapper> >::iterator hostsIt;
+        // Check that the same CPU memory was previously wrapped.
+        hostsIt = hostWrappers.find(m.data);
+        if (hostsIt == hostWrappers.end())
+        {
+            // If not wrapped before.
+            return (hostWrappers[m.data] = wrapHost(m, backendId, targetId));
+        }
+        else
+        {
+            // Find if a wrapper of this host and shape was created before.
+            std::map<std::pair<void*, MatSize>, Ptr<BackendWrapper> >::iterator it;
+            std::pair<void*, MatSize> key(m.data, m.size);
+            it = extraWrappers.find(key);
+            if (it == extraWrappers.end())
+            {
+                MatShape shape(m.dims);
+                for (int i = 0; i < m.dims; ++i)
+                    shape[i] = m.size.p[i];
+                return (extraWrappers[key] = wrapUser(hostsIt->second, shape));
+            }
+            else
+                return it->second;
+        }
+    }
+
+    std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat*>& mats,
+                                           int backendId, int targetId = DNN_TARGET_CPU)
+    {
+        const int num = mats.size();
+        std::vector<Ptr<BackendWrapper> > dst(num);
+        for (int i = 0; i < num; ++i)
+        {
+            dst[i] = wrap(*mats[i], backendId, targetId);
+        }
+        return dst;
+    }
+
+    std::vector<Ptr<BackendWrapper> > wrap(const std::vector<Mat>& mats,
+                                           int backendId, int targetId = DNN_TARGET_CPU)
+    {
+        const int num = mats.size();
+        std::vector<Ptr<BackendWrapper> > dst(num);
+        for (int i = 0; i < num; ++i)
+        {
+            dst[i] = wrap(mats[i], backendId, targetId);
+        }
+        return dst;
+    }
+
+    void reset()
+    {
+        hostWrappers.clear();
+        extraWrappers.clear();
+    }
+
+private:
+    // Backend-specific wrapping function.
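+    // Dispatch on backendId: DNN_BACKEND_DEFAULT needs no wrapper (the plain
+    // cv::Mat is used directly), DNN_BACKEND_HALIDE wraps the Mat into a
+    // HalideBackendWrapper. A new backend would add one more branch here and
+    // in wrapUser() below.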
+ Ptr wrapHost(const Mat& m, int backendId, int targetId) + { + if (backendId == DNN_BACKEND_DEFAULT) + { + return Ptr(); + } + else if (backendId == DNN_BACKEND_HALIDE) + { + CV_Assert(haveHalide()); +#ifdef HAVE_HALIDE + return Ptr(new HalideBackendWrapper(targetId, m)); +#endif // HAVE_HALIDE + } + else + { + CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); + } + return Ptr(); + } + + // Backend-specific wrapping function. + Ptr wrapUser(const Ptr& host, const MatShape& shape) + { + int backendId = host->backendId; + if (backendId == DNN_BACKEND_DEFAULT) + { + return Ptr(); + } + else if (backendId == DNN_BACKEND_HALIDE) + { + CV_Assert(haveHalide()); +#ifdef HAVE_HALIDE + return Ptr(new HalideBackendWrapper(host, shape)); +#endif // HAVE_HALIDE + } + else + { + CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); + } + return Ptr(); + } + + // Wrappers that initialized for memory hosts (first wrapping of CPU data). + std::map > hostWrappers; + // The rest of wrappers. They initialized for non-host cv::Mat. + std::map, Ptr > extraWrappers; +}; + struct LayerData { LayerData() {} @@ -201,6 +318,10 @@ struct LayerData std::vector outputBlobs; std::vector inputBlobs; std::vector internals; + // Computation nodes of implemented backends (except DEFAULT). + std::map > backendNodes; + // Flag for skip layer computation for specific backend. + std::map skipFlags; int flag; @@ -439,7 +560,7 @@ public: } private: - // Registed allocated memory. + // Register allocated memory. void addHost(const LayerPin& lp, const Mat& mat) { CV_Assert(memHosts.find(lp) == memHosts.end()); @@ -472,6 +593,7 @@ struct Net::Impl lastLayerId = 1; netWasAllocated = false; + preferableBackend = DNN_BACKEND_DEFAULT; } Ptr netInputLayer; @@ -480,6 +602,9 @@ struct Net::Impl MapIdToLayerData layers; std::map layerNameToId; BlobManager blobManager; + int preferableBackend; + // Backend-specific wrapping manager. + BackendWrapManager backendWrapper; int lastLayerId; @@ -491,6 +616,7 @@ struct Net::Impl { allocateLayers(); computeNetOutputLayers(); + initBackend(); netWasAllocated = true; } @@ -646,6 +772,69 @@ struct Net::Impl #endif } + void initBackend() + { + backendWrapper.reset(); + if (preferableBackend == DNN_BACKEND_DEFAULT) + return; + + // Iterator to current layer. + MapIdToLayerData::iterator it = layers.begin(); + // Iterator to base layer for fusion. In example, in case of conv+bn+relu + // it'll be a conv layer. + MapIdToLayerData::iterator baseIt = layers.begin(); + for (; it != layers.end(); it++) + { + LayerData &ldTop = it->second; + Ptr layerTop = ldTop.layerInstance; + if (!layerTop->supportBackend(preferableBackend)) + { + // Move base iterator to layer that don't support preferable + // backend to prevent fusion over layer of different backend. + baseIt = it; + continue; + } + // Try to do layers fusion. + LayerData &ldBot = baseIt->second; + Ptr layerBot = ldBot.layerInstance; + // 1. Check that bottom and top from the same backends. + if (it != layers.begin() && layerBot->supportBackend(preferableBackend)) + { + // 2. Check that current layer works in-place. + bool inPlace = ldTop.inputBlobs.size() == 1 && + ldBot.outputBlobs.size() == 1 && + ldTop.inputBlobs[0]->data == + ldBot.outputBlobs[0].data; + if (inPlace) + { + // 3. Try to attach node. 
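+                    // A non-empty result means the bottom node's last Halide
+                    // function absorbed this layer (e.g. conv + batch norm + ReLU
+                    // collapse into one function): mark the top layer as skipped
+                    // and let the fused node replace the bottom one.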
+ CV_Assert(!ldBot.backendNodes[preferableBackend].empty()); + Ptr fusedNode = + layerTop->tryAttach(ldBot.backendNodes[preferableBackend]); + if (!fusedNode.empty()) + { + ldTop.skipFlags[preferableBackend] = true; + ldBot.backendNodes[preferableBackend] = fusedNode; + continue; + } + } + } + // No layers fusion. + ldTop.skipFlags[preferableBackend] = false; + std::vector > inputs = + backendWrapper.wrap(ldTop.inputBlobs, preferableBackend); + if (preferableBackend == DNN_BACKEND_HALIDE) + { + ldTop.backendNodes[DNN_BACKEND_HALIDE] = layerTop->initHalide(inputs); + baseIt = it; + } + else + { + CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); + } + } + } + #define CV_RETHROW_ERROR(err, newmsg)\ cv::error(err.code, newmsg, err.func.c_str(), err.file.c_str(), err.line) @@ -774,7 +963,26 @@ struct Net::Impl //forward itself //try { - ld.layerInstance->forward(ld.inputBlobs, ld.outputBlobs, ld.internals); + Ptr layer = ld.layerInstance; + if (preferableBackend == DNN_BACKEND_DEFAULT || + !layer->supportBackend(preferableBackend)) + { + layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals); + } + else if (!ld.skipFlags[preferableBackend]) + { + std::vector > outputs = + backendWrapper.wrap(ld.outputBlobs, preferableBackend); + Ptr node = ld.backendNodes[preferableBackend]; + if (preferableBackend == DNN_BACKEND_HALIDE) + { + forwardHalide(outputs, node); + } + else + { + CV_Error(Error::StsNotImplemented, "Unknown backend identifier"); + } + } } /*catch (const cv::Exception &err) { @@ -905,7 +1113,14 @@ void Net::allocate() void Net::forward(LayerId toLayer) { - impl->setUpNet(); + if (!impl->netWasAllocated) + { + impl->setUpNet(); + // If user didn't call compileHalide() between + // setPreferableBackend(DNN_BACKEND_HALIDE) and forward(). + if (impl->preferableBackend == DNN_BACKEND_HALIDE) + compileHalide(); + } if (toLayer.isString() && toLayer.get().empty()) impl->forwardAll(); @@ -913,6 +1128,41 @@ void Net::forward(LayerId toLayer) impl->forwardLayer(impl->getLayerData(toLayer)); } +void Net::compileHalide(const std::string& configFile) +{ + CV_Assert(impl->preferableBackend == DNN_BACKEND_HALIDE); + if (!impl->netWasAllocated) + impl->setUpNet(); + + HalideScheduler scheduler(configFile); + Impl::MapIdToLayerData::iterator it; + for (it = impl->layers.begin(); it != impl->layers.end(); ++it) + { + LayerData &ld = it->second; + Ptr layer = ld.layerInstance; + if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skipFlags[DNN_BACKEND_HALIDE]) + { + CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty()); + bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]); + if (!scheduled) + { + // Use automatic scheduling provided by layer. 
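+                // Every layer provides a default schedule (tiling, vectorization,
+                // parallel loops) tuned by its output shapes, so a network still
+                // runs without any scheduling file; a hand-written schedule in
+                // the YAML config just overrides these defaults per function.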
+ layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE], + ld.inputBlobs, ld.outputBlobs); + } + dnn::compileHalide(ld.outputBlobs, ld.backendNodes[DNN_BACKEND_HALIDE], + DNN_TARGET_CPU); + } + } +} + +void Net::setPreferableBackend(int backendId) +{ + impl->netWasAllocated = impl->netWasAllocated && + impl->preferableBackend == backendId; + impl->preferableBackend = backendId; +} + void Net::setNetInputs(const std::vector &inputBlobNames) { impl->netInputLayer->setNames(inputBlobNames); @@ -1295,6 +1545,30 @@ int Layer::outputNameToIndex(String) return -1; } +bool Layer::supportBackend(int backendId) +{ + return backendId == DNN_BACKEND_DEFAULT; +} + +Ptr Layer::initHalide(const std::vector > &) +{ + CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type + + " layers is not defined."); + return Ptr(); +} + +void Layer::applyHalideScheduler(Ptr& node, const std::vector &inputs, + const std::vector &outputs) const +{ + CV_Error(Error::StsNotImplemented, "Scheduling of " + type + + " layers is not implemented."); +} + +Ptr Layer::tryAttach(const Ptr& node) +{ + return Ptr(); +} + template static void vecToPVec(const std::vector &v, std::vector &pv) { @@ -1396,5 +1670,26 @@ Ptr LayerFactory::createLayerInstance(const String &_type, LayerParams& p } } +BackendNode::BackendNode(int backendId) : backendId(backendId) {} + +BackendNode::~BackendNode() {}; + +BackendWrapper::BackendWrapper(int backendId, int targetId) + : backendId(backendId), targetId(targetId) {} + +BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m) +{ + CV_Error(Error::StsNotImplemented, + "Constructor of backend wrapper must be implemented"); +} + +BackendWrapper::BackendWrapper(const Ptr& base, const MatShape& shape) +{ + CV_Error(Error::StsNotImplemented, + "Constructor of backend wrapper must be implemented"); +} + +BackendWrapper::~BackendWrapper() {} + } } diff --git a/modules/dnn/src/halide_scheduler.cpp b/modules/dnn/src/halide_scheduler.cpp new file mode 100644 index 000000000..04ac34b1f --- /dev/null +++ b/modules/dnn/src/halide_scheduler.cpp @@ -0,0 +1,260 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. + +#include "halide_scheduler.hpp" +#include "op_halide.hpp" + +namespace cv +{ +namespace dnn +{ + +#ifdef HAVE_HALIDE +static void applySplit(const FileNode& directive, Halide::Func& func, + const FileNode& params) +{ + for (const auto& varNode : directive) + { + const std::string varName = varNode.name(); + const std::string factorName = (std::string)varNode; + Halide::Var var(varName); + Halide::Var outerVar(varName + "o"); + Halide::Var innerVar(varName + "i"); + // If split factor is integer or parameters map has parameter value. + CV_Assert(varNode.isString() && !params[factorName].empty() || + varNode.isInt()); + int factor = (int)(varNode.isInt() ? 
varNode : params[factorName]); + func.split(var, outerVar, innerVar, factor); + } +} + +static void applyReorder(const FileNode& directive, Halide::Func& func) +{ + std::string varName; + const int numVars = directive.size(); + std::vector reorderedVars; + reorderedVars.reserve(numVars); + for (int i = 0; i < numVars; ++i) + { + directive[i] >> varName; + reorderedVars.push_back(Halide::Var(varName)); + } + func.reorder(reorderedVars); +} + +static void applyFuse(const FileNode& directive, Halide::Func& func) +{ + CV_Assert(directive["src"].size() >= 2); + CV_Assert(directive["dst"].size() == 1); + + std::string str; + directive["src"][0] >> str; + Halide::Var firstVar(str); + directive["src"][1] >> str; + Halide::Var secondVar(str); + directive["dst"] >> str; + Halide::Var dstVar(str); + + func.fuse(firstVar, secondVar, dstVar); + for (int i = 2, n = directive["src"].size(); i < n; ++i) + { + directive["src"][i] >> str; + func.fuse(Halide::Var(str), dstVar, dstVar); + } +} + +static void applyParallel(const FileNode& directive, Halide::Func& func) +{ + std::string varName; + for (int i = 0, n = directive.size(); i < n; ++i) + { + directive[i] >> varName; + func.parallel(Halide::Var(varName)); + } +} + +static void applyUnroll(const FileNode& directive, Halide::Func& func) +{ + std::string varName; + for (int i = 0, n = directive.size(); i < n; ++i) + { + directive[i] >> varName; + func.unroll(Halide::Var(varName)); + } +} + +static void applyVectorize(const FileNode& directive, Halide::Func& func, + const FileNode& params) +{ + for (const auto& varNode : directive) + { + const std::string varName = varNode.name(); + const std::string factorName = (std::string)varNode; + // If split factor is integer or parameters map has parameter value. + CV_Assert(varNode.isString() && !params[factorName].empty() || + varNode.isInt()); + int factor = (int)(varNode.isInt() ? 
varNode : params[factorName]); + Halide::Var var(varName); + Halide::Var inner(varName + "v"); + func.split(var, var, inner, factor); + func.vectorize(inner); + } +} + +static void applyStoreAt(const FileNode& directive, Halide::Func& func, + std::map& funcsMap) +{ + for (const auto& funcNode : directive) + { + const std::string targetFuncName = funcNode.name(); + if (funcsMap.find(targetFuncName) == funcsMap.end()) + CV_Error(cv::Error::StsParseError, "Function " + targetFuncName + + " is not represented in Halide pipeline"); + Halide::Func targetFunc = funcsMap[targetFuncName]; + func.store_at(targetFunc, (std::string)funcNode); + break; + } +} + +static void applyComputeAt(const FileNode& directive, Halide::Func& func, + std::map& funcsMap) +{ + for (const auto& funcNode : directive) + { + const std::string targetFuncName = funcNode.name(); + if (funcsMap.find(targetFuncName) == funcsMap.end()) + CV_Error(cv::Error::StsParseError, "Function " + targetFuncName + + " is not represented in Halide pipeline"); + Halide::Func targetFunc = funcsMap[targetFuncName]; + func.compute_at(targetFunc, (std::string)funcNode); + break; + } +} + +static void applyComputeRoot(const FileNode& directive, Halide::Func& func) +{ + bool compute_root; + directive >> compute_root; + if (compute_root) + func.compute_root(); +} + +static void apply(const FileNode& directives, Halide::Func& func, + std::map& funcsMap, + const FileNode& params) +{ + for (const auto& directive : directives) + { + if (directive.name() == "split") + applySplit(directive, func, params); + else if (directive.name() == "reorder") + applyReorder(directive, func); + else if (directive.name() == "fuse") + applyFuse(directive, func); + else if (directive.name() == "parallel") + applyParallel(directive, func); + else if (directive.name() == "unroll") + applyUnroll(directive, func); + else if (directive.name() == "vectorize") + applyVectorize(directive, func, params); + else if (directive.name() == "store_at") + applyStoreAt(directive, func, funcsMap); + else if (directive.name() == "compute_at") + applyComputeAt(directive, func, funcsMap); + else if (directive.name() == "compute_root") + applyComputeRoot(directive, func); + else + CV_Error(Error::StsNotImplemented, "Scheduling directive " + + directive.name() + " is not implemented."); + } +} + +// Remove any numeric symbols after '$' sign. +static std::string Deunique(std::string str) +{ + int pos = -1; + do + { + pos = str.find('$'); + if (pos != -1) + { + int len = str.find_first_not_of("0123456789", pos + 1) - pos; + str = str.replace(pos, len, ""); + } + } + while (pos != -1); + return str; +} +#endif // HAVE_HALIDE + +HalideScheduler::HalideScheduler(const std::string& configFile) +{ + if (!configFile.empty()) + fs = FileStorage(configFile, FileStorage::READ); +} + +HalideScheduler::~HalideScheduler() +{ + if (fs.isOpened()) + fs.release(); +} + +bool HalideScheduler::process(Ptr& node) +{ +#ifdef HAVE_HALIDE + if (!fs.isOpened()) + return false; + + const FileNode& scheduleNode = fs["scheduling"]; + if (scheduleNode.empty()) + CV_Error(cv::Error::StsParseError, "Scheduling file should has scheduling node"); + + std::string str; + std::map funcsMap; // Scheduled functions. + // For every function, from top to bottom, we try to find a scheduling node. + // Scheduling is successful (return true) if for the first function (top) + // node is respresented. 
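+    // For reference, a scheduling file this parser accepts could look like the
+    // sketch below (layer, pattern, variable names and factors are illustrative
+    // only; see tutorial_dnn_halide_scheduling.markdown):
+    //
+    //   %YAML:1.0
+    //   scheduling:
+    //     conv1:
+    //       pattern: conv_tiled
+    //       params: { tile_x: 8 }
+    //     prob:
+    //       compute_root: 1
+    //   patterns:
+    //     conv_tiled:
+    //       split: { x: tile_x }
+    //       parallel: [ y ]
+    //       vectorize: { xi: 8 }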
+ CV_Assert(!node.empty()); + std::vector& funcs = node.dynamicCast()->funcs; + for (int i = funcs.size() - 1; i >= 0; --i) + { + Halide::Func& func = funcs[i]; + // For functions with the same name Halide generates unique names + // for example func, func$1, func$2. + // They are always formed with '$' and number. + std::string funcName = Deunique(func.name()); + + const FileNode& funcNode = scheduleNode[funcName]; + if (!funcNode.empty()) + { + if (!funcNode["pattern"].empty()) + { + funcNode["pattern"] >> str; + if (fs["patterns"][str].empty()) + CV_Error(cv::Error::StsParseError, "Scheduling pattern " + str + + " is not defined"); + apply(fs["patterns"][str], func, funcsMap, funcNode["params"]); + } + else + { + apply(funcNode, func, funcsMap, funcNode["params"]); + } + } + else + { + if (funcsMap.empty()) + return false; + } + funcsMap[funcName] = func; + } + return true; +#endif // HAVE_HALIDE + return false; +} + +} // namespace dnn +} // namespace cv diff --git a/modules/dnn/src/halide_scheduler.hpp b/modules/dnn/src/halide_scheduler.hpp new file mode 100644 index 000000000..9afef3023 --- /dev/null +++ b/modules/dnn/src/halide_scheduler.hpp @@ -0,0 +1,37 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifndef __OPENCV_DNN_HALIDE_SCHEDULER_HPP__ +#define __OPENCV_DNN_HALIDE_SCHEDULER_HPP__ + +#include + +namespace cv +{ +namespace dnn +{ + +class HalideScheduler +{ +public: + HalideScheduler(const std::string& configFile); + + ~HalideScheduler(); + + // Returns true if pipeline found in scheduling file. + // If more than one function, returns true if the top function scheduled. + // Other functions are optional to scheduling. + bool process(Ptr& node); + +private: + FileStorage fs; +}; + +} // namespace dnn +} // namespace cv + +#endif // __OPENCV_DNN_HALIDE_SCHEDULER_HPP__ diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index e5f5b68a0..7726a7b77 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -10,6 +10,7 @@ Implementation of Batch Normalization layer. 
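 
    The Halide path added below folds the whole normalization into one
    per-channel affine transform; as a sketch of the computation performed
    by attachHalide() (weight and bias default to 1 and 0 when absent):
 
        w(c) = weight(c) / sqrt(std(c) * varMeanScale + epsilon)
        b(c) = bias(c) - w(c) * mean(c) * varMeanScale
        top(x, y, c, n) = input(x, y, c, n) * w(c) + b(c)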
*/ #include "../precomp.hpp" +#include "op_halide.hpp" #include namespace cv @@ -39,6 +40,12 @@ public: return true; } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide(); + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_Assert(blobs.size() >= 2); @@ -88,6 +95,73 @@ public: } } + virtual Ptr tryAttach(const Ptr& node) + { + switch (node->backendId) + { + case DNN_BACKEND_HALIDE: + { +#ifdef HAVE_HALIDE + auto base = node.dynamicCast(); + Halide::Func& input = base->funcs.back(); + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = attachHalide(input(x, y, c, n)); + return Ptr(new HalideBackendNode(base, top)); +#endif // HAVE_HALIDE + break; + } + } + return Ptr(); + } + + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer input = halideBuffer(inputs[0]); + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = attachHalide(input(x, y, c, n)); + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + +#ifdef HAVE_HALIDE + // attachHalide can work both with Halide::Buffer and Halide::Func. In the + // second case it will be a fusion. + Halide::Func attachHalide(const Halide::Expr& input) + { + Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name)); + Halide::Var x("x"), y("y"), c("c"), n("n"); + + const int weightsBlobIndex = 2; + const int biasBlobIndex = weightsBlobIndex + hasWeights; + const int numChannels = blobs[0].total(); + float* meanData = (float*)blobs[0].data; + float* stdData = (float*)blobs[1].data; + float* weightsData = (hasWeights ? (float*)blobs[weightsBlobIndex].data : NULL); + float* biasData = (hasBias ? (float*)blobs[biasBlobIndex].data : NULL); + + float varMeanScale = 1.f; + if (!hasWeights && !hasBias) { + varMeanScale = *blobs[2].ptr(); + if (varMeanScale != 0) + varMeanScale = 1/varMeanScale; + } + + Halide::Buffer weights(numChannels); + Halide::Buffer bias(numChannels); + for (int i = 0; i < numChannels; ++i) + { + weights(i) = (hasWeights ? weightsData[i] : 1.0f) / + sqrt(stdData[i] * varMeanScale + epsilon); + bias(i) = (hasBias ? biasData[i] : 0.0f) - + weights(i) * meanData[i] * varMeanScale; + } + top(x, y, c, n) = input * weights(c) + bias(c); + return top; + } +#endif // HAVE_HALIDE + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 551328ed2..d827f5e6d 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -41,6 +41,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "op_halide.hpp" namespace cv { @@ -86,6 +87,12 @@ public: return false; } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { int cAxis = clamp(axis, inputs[0]->dims); @@ -100,6 +107,52 @@ public: ranges[cAxis].start = ranges[cAxis].end; } } + + virtual Ptr initHalide(const std::vector > &input) + { +#ifdef HAVE_HALIDE + std::vector > inputBuffers = halideBuffers(input); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? 
Halide::Func() : Halide::Func(name)); + int offset = inputBuffers[0].channels(); + Halide::Expr topExpr = select(c < offset, + inputBuffers[0](x, y, c, n), + inputBuffers[1](x, y, c - offset, n)); + for (int i = 2; i < input.size(); ++i) + { + offset += inputBuffers[i - 1].channels(); + topExpr = select(c < offset, topExpr, + inputBuffers[i](x, y, c - offset, n)); + } + top(x, y, c, n) = topExpr; + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"); + Halide::Func& top = node.dynamicCast()->funcs.back(); + + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + if (outW == 1 || outH <= 2) + return; + + top.reorder(x, c, y) + .split(y, yo, yi, 2) + .fuse(yo, n, tile) + .parallel(tile) + .unroll(yi) + .vectorize(x, outW >= 16 ? 16 : outW); +#endif // HAVE_HALIDE + } }; Ptr ConcatLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 6a14f92ce..1e7229b87 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -43,6 +43,7 @@ #include "layers_common.hpp" #include "op_im2col.hpp" #include "op_blas.hpp" +#include "op_halide.hpp" #include "opencv2/core/hal/intrin.hpp" #include @@ -64,6 +65,13 @@ public: } #endif } + + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide(); + } + void finalize(const std::vector &inputs, std::vector &outputs) { CV_Assert(inputs.size() > 0); @@ -98,6 +106,40 @@ public: (dilation.height == 1 && dilation.width == 1); } bool setActivation(const Ptr& ) { return false; } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"), co("co"), ci("ci"); + Halide::Func& top = node.dynamicCast()->funcs[1]; + Halide::Func& padded_input = node.dynamicCast()->funcs[0]; + + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + if (outW == 1 || outH <= 2) + return; + + if (is1x1() || outC <= 16) + top.reorder(x, c, y) + .split(y, yo, yi, 2) + .fuse(yo, n, tile) + .parallel(tile) + .unroll(yi) + .vectorize(x, outW >= 16 ? 16 : outW); + else + top.reorder(x, c, y) + .split(y, yo, yi, 2) + .split(c, co, ci, 16) + .fuse(yo, co, tile).fuse(n, tile, tile) + .parallel(tile) + .unroll(yi) + .vectorize(x, outW >= 16 ? 16 : outW); + padded_input.compute_at(top, yi); +#endif // HAVE_HALIDE + } }; //TODO: simultaneously convolution and bias addition for cache optimization @@ -155,6 +197,66 @@ public: bool setActivation(const Ptr& layer) { activ = layer; return true; } + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer inputBuffer = halideBuffer(inputs[0]); + + const int inpCn = inputBuffer.channels(); + const int outCn = blobs[0].size[0]; + const int inpGroupCn = blobs[0].size[1]; + const int group = inpCn / inpGroupCn; + const int outGroupCn = outCn / group; + + Halide::Buffer weights = wrapToHalideBuffer(blobs[0]); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? 
Halide::Func() : Halide::Func(name)); + Halide::Func padded_input(name + "_constant_exterior"); + if (pad.width || pad.height) + { + Halide::Func bounded = + Halide::BoundaryConditions::constant_exterior(inputBuffer, 0); + padded_input(x, y, c, n) = bounded(x, y, c, n); + } + else + { + padded_input(x, y, c, n) = inputBuffer(x, y, c, n); + } + + Halide::RDom r(0, kernel.width, 0, kernel.height, 0, inpGroupCn); + + Halide::Expr kc = r.z; + if (group > 1) + { + int outCnBound = outGroupCn; + int inpChBound = inpGroupCn; + Halide::Expr shift = select(c < outCnBound, 0, inpChBound); + for (int i = 2; i < group; ++i) + { + outCnBound += outGroupCn; + inpChBound += inpGroupCn; + shift = select(c < outCnBound, shift, inpChBound); + } + kc += shift; + } + + Halide::Expr kx = x * stride.width - pad.width + r.x * dilation.width; + Halide::Expr ky = y * stride.height - pad.height + r.y * dilation.height; + Halide::Expr topExpr = sum(padded_input(kx, ky, kc, n) * + weights(r.x, r.y, r.z, c)); + if (hasBias()) + { + Halide::Buffer bias = wrapToHalideBuffer(blobs[1], {outCn}); + topExpr += bias(c); + } + top(x, y, c, n) = topExpr; + Ptr pp(new HalideBackendNode({ padded_input, top })); + return Ptr(new HalideBackendNode({ padded_input, top })); +#endif // HAVE_HALIDE + return Ptr(); + } + class ParallelConv : public cv::ParallelLoopBody { public: @@ -644,6 +746,53 @@ public: dilation.height, dilation.width, dstImg.ptr(), &ofsbuf[0]); } + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer inputBuffer = halideBuffer(inputs[0]); + + int inW, inH, inC, inN, outC = blobs[0].size[0]; + getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN); + + if (inC / blobs[0].size[1] != 1) + CV_Error(cv::Error::StsNotImplemented, + "Halide backend for Deconvolution with group > 1 is not implemented"); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? 
Halide::Func() : Halide::Func(name)); + Halide::Func padded_input(name + "_constant_exterior"); + auto weights = wrapToHalideBuffer(blobs[0], {kernel.width, + kernel.height, outC, inC}); + + Halide::Func dilated_input("dilated_input"); + dilated_input(x, y, c, n) = 0.0f; + Halide::RDom r1(0, inW, 0, inH); + dilated_input(r1.x * stride.width, r1.y * stride.height, c, n) = + inputBuffer(r1.x, r1.y, c, n); + dilated_input.compute_root(); + + Halide::Func bounded = + Halide::BoundaryConditions::constant_exterior(dilated_input, 0, + 0, (inW - 1) * stride.width + 1, + 0, (inH - 1) * stride.height + 1, + 0, inC, 0, inN); + padded_input(x, y, c, n) = bounded(x, y, c, n); + + Halide::RDom r(0, kernel.width, 0, kernel.height, 0, inC); + Halide::Expr topExpr = sum( + padded_input(x + pad.width - r.x, y + pad.height - r.y, r.z, n) * + weights(r.x, r.y, c, r.z)); + if (hasBias()) + { + auto bias = wrapToHalideBuffer(blobs[1], {outC}); + topExpr += bias(c); + } + top(x, y, c, n) = topExpr; + return Ptr(new HalideBackendNode({ padded_input, top })); +#endif // HAVE_HALIDE + return Ptr(); + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { @@ -680,6 +829,8 @@ static void initConvDeconvLayerFromCaffe(Ptr l, const Laye CV_Assert(numOutput % ngroups == 0); CV_Assert((bias && l->blobs.size() == 2) || (!bias && l->blobs.size() == 1)); + CV_Assert(l->adjustPad.width < l->stride.width && + l->adjustPad.height < l->stride.height); } Ptr ConvolutionLayer::create(const LayerParams ¶ms) diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index c43b3fb77..679bb498d 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -1,4 +1,5 @@ #include "../precomp.hpp" +#include "op_halide.hpp" #include "opencv2/imgproc.hpp" #include @@ -64,6 +65,44 @@ public: ElementWiseLayer(const Func &f=Func()) { func = f; } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide(); + } + + virtual Ptr tryAttach(const Ptr& node) + { + switch (node->backendId) + { + case DNN_BACKEND_HALIDE: + { +#ifdef HAVE_HALIDE + auto base = node.dynamicCast(); + Halide::Func& input = base->funcs.back(); + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (this->name.empty() ? Halide::Func() : Halide::Func(this->name)); + func.attachHalide(input(x, y, c, n), top); + return Ptr(new HalideBackendNode(base, top)); +#endif // HAVE_HALIDE + break; + } + } + return Ptr(); + } + + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer input = halideBuffer(inputs[0]); + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (this->name.empty() ? 
Halide::Func() : Halide::Func(this->name));
+        func.attachHalide(input(x, y, c, n), top);
+        return Ptr<BackendNode>(new HalideBackendNode(top));
+#endif  // HAVE_HALIDE
+        return Ptr<BackendNode>();
+    }
+
     bool getMemoryShapes(const std::vector<MatShape> &inputs,
                          const int requiredOutputs,
                          std::vector<MatShape> &outputs,
@@ -147,6 +186,21 @@ struct ReLUFunctor
         }
     }
 
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        if (slope)
+        {
+            top(x, y, c, n) = select(input >= 0.0f, input, slope * input);
+        }
+        else
+        {
+            top(x, y, c, n) = max(input, 0.0f);
+        }
+    }
+#endif  // HAVE_HALIDE
+
     int64 getFLOPSPerElement() const { return 1; }
 };
 
@@ -166,6 +220,14 @@ struct TanHFunctor
         }
     }
 
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        top(x, y, c, n) = tanh(input);
+    }
+#endif  // HAVE_HALIDE
+
     int64 getFLOPSPerElement() const { return 1; }
 };
 
@@ -185,6 +247,14 @@ struct SigmoidFunctor
         }
     }
 
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        top(x, y, c, n) = 1.0f / (1.0f + exp(-input));
+    }
+#endif  // HAVE_HALIDE
+
     int64 getFLOPSPerElement() const { return 3; }
 };
 
@@ -204,6 +274,14 @@ struct AbsValFunctor
         }
     }
 
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        top(x, y, c, n) = abs(input);
+    }
+#endif  // HAVE_HALIDE
+
     int64 getFLOPSPerElement() const { return 1; }
 };
 
@@ -223,6 +301,14 @@ struct BNLLFunctor
         }
     }
 
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        top(x, y, c, n) = log(1.0f + exp(-abs(input)));
+    }
+#endif  // HAVE_HALIDE
+
     int64 getFLOPSPerElement() const { return 5; }
 };
 
@@ -264,6 +350,23 @@ struct PowerFunctor
         }
     }
 
+#ifdef HAVE_HALIDE
+    void attachHalide(const Halide::Expr& input, Halide::Func& top)
+    {
+        Halide::Var x("x"), y("y"), c("c"), n("n");
+        Halide::Expr topExpr = (scale == 1.0f ? input : input * scale);
+        if (shift)
+        {
+            topExpr += shift;
+        }
+        if (power != 1.0f)
+        {
+            topExpr = pow(topExpr, power);
+        }
+        top(x, y, c, n) = topExpr;
+    }
+#endif  // HAVE_HALIDE
+
     int64 getFLOPSPerElement() const { return power == 1 ?
2 : 10; } }; @@ -314,6 +417,15 @@ struct ChannelsPReLUFunctor } } +#ifdef HAVE_HALIDE + void attachHalide(const Halide::Expr& input, Halide::Func& top) + { + Halide::Var x("x"), y("y"), c("c"), n("n"); + auto weights = wrapToHalideBuffer(scale, {(int)scale.total()}); + top(x, y, c, n) = select(input > 0.0f, input, weights(c) * input); + } +#endif // HAVE_HALIDE + int64 getFLOPSPerElement() const { return 1; } }; diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 3d3e64e40..2c9cf8942 100755 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -41,6 +41,8 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "op_halide.hpp" + namespace cv { namespace dnn @@ -81,6 +83,12 @@ public: } } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide(); + } + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -144,6 +152,75 @@ public: } } + virtual Ptr initHalide(const std::vector > &input) + { +#ifdef HAVE_HALIDE + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name)); + Halide::Expr topExpr; + std::vector > inputBuffers = halideBuffers(input); + switch (op) + { + case SUM: + if (coeffs.empty()) + { + topExpr = inputBuffers[0](x, y, c, n) + + inputBuffers[1](x, y, c, n); + for (int i = 2; i < inputBuffers.size(); ++i) + topExpr += inputBuffers[i](x, y, c, n); + } + else + { + topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) + + coeffs[1] * inputBuffers[1](x, y, c, n); + for (int i = 2; i < inputBuffers.size(); ++i) + topExpr += coeffs[i] * inputBuffers[i](x, y, c, n); + } + break; + case PROD: + topExpr = inputBuffers[0](x, y, c, n) * + inputBuffers[1](x, y, c, n); + for (int i = 2; i < inputBuffers.size(); ++i) + topExpr *= inputBuffers[i](x, y, c, n); + break; + case MAX: + topExpr = max(inputBuffers[0](x, y, c, n), + inputBuffers[1](x, y, c, n)); + for (int i = 2; i < inputBuffers.size(); ++i) + topExpr = max(topExpr, inputBuffers[i](x, y, c, n)); + break; + default: + return Ptr(); + } + top(x, y, c, n) = topExpr; + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"); + Halide::Func& top = node.dynamicCast()->funcs.back(); + + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + if (outW == 1 || outH <= 2) + return; + + top.reorder(x, c, y) + .split(y, yo, yi, 2) + .fuse(yo, n, tile) + .parallel(tile) + .unroll(yi) + .vectorize(x, outW >= 16 ? 
16 : outW); +#endif // HAVE_HALIDE + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 3c5cc3a79..e9ac5576d 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -42,6 +42,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_blas.hpp" +#include "op_halide.hpp" #include namespace cv @@ -104,6 +105,12 @@ public: return false; } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; + } + class FullConnected : public ParallelLoopBody { public: @@ -213,6 +220,55 @@ public: } } + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + int inW, inH, inC, inN, outC = blobs[0].size[0]; + Halide::Buffer inputBuffer = halideBuffer(inputs[0]); + getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN); + auto weights = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC}); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name)); + Halide::RDom r(0, inW, 0, inH, 0, inC); + Halide::Expr topExpr = sum(inputBuffer(r.x, r.y, r.z, n) * + weights(r.x, r.y, r.z, c)); + if (bias) + { + Halide::Buffer bias = wrapToHalideBuffer(blobs[1], {outC}); + topExpr += bias(c); + } + top(x, y, c, n) = topExpr; + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile"); + Halide::Func& top = node.dynamicCast()->funcs.back(); + + if (outC + outN == 1) + return; + + if (outC > 8) + top.split(c, co, ci, 8) + .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile) + .parallel(tile) + .vectorize(ci, 8); + else + top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile) + .parallel(tile); +#endif // HAVE_HALIDE + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index 4c3cbccd7..abe1784e4 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -41,6 +41,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "op_halide.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/dnn/shape_utils.hpp" #include "opencv2/core/hal/hal.hpp" @@ -76,6 +77,12 @@ public: normBySize = params.get("norm_by_size", true); } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide(); + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_Assert(inputs.size() == outputs.size()); @@ -222,6 +229,73 @@ public: } } + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + float alphaSize = alpha; + if (normBySize) + alphaSize /= (type == CHANNEL_NRM ? 
size : size * size); + int width, height, channels, numImgs; + Halide::Buffer inputBuffer = halideBuffer(inputs[0]); + getCanonicalSize(inputBuffer, &width, &height, &channels, &numImgs); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name)); + Halide::Func padded_sq(name + "_padded_sq"); + Halide::Func sq("sq"); + sq(x, y, c, n) = inputBuffer(x, y, c, n) * inputBuffer(x, y, c, n); + + Halide::Func bounded = + Halide::BoundaryConditions::constant_exterior(sq, 0, 0, width, + 0, height, + 0, channels, + 0, numImgs); + padded_sq(x, y, c, n) = bounded(x, y, c, n); + + Halide::Expr base; + if (type == CHANNEL_NRM) + { + Halide::RDom r((1 - size) / 2, size); + base = alphaSize * sum(padded_sq(x, y, c + r, n)); + } + else // SPATIAL_NRM + { + Halide::RDom r((1 - size) / 2, size, (1 - size) / 2, size); + base = alphaSize * sum(padded_sq(x + r.x, y + r.y, c, n)); + } + base += static_cast(bias); + top(x, y, c, n) = inputBuffer(x, y, c, n) / pow(base, beta); + return Ptr(new HalideBackendNode({ padded_sq, top })); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + Halide::Var x("x"), y("y"), c("c"), n("n"), yo("yo"), yi("yi"), tile("tile"); + Halide::Func& top = node.dynamicCast()->funcs[1]; + Halide::Func& padded_sq = node.dynamicCast()->funcs[0]; + + if (outW < 8 || outH <= 2) + return; + + top.reorder(x, c, y, n) + .split(y, yo, yi, 2) + .fuse(yo, n, tile) + .parallel(tile) + .unroll(yi) + .vectorize(x, 8); + padded_sq.store_at(top, tile) + .compute_at(top, yi); +#endif // HAVE_HALIDE + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { diff --git a/modules/dnn/src/layers/max_unpooling_layer.cpp b/modules/dnn/src/layers/max_unpooling_layer.cpp index 770df9e5c..0b7ad8c62 100644 --- a/modules/dnn/src/layers/max_unpooling_layer.cpp +++ b/modules/dnn/src/layers/max_unpooling_layer.cpp @@ -11,6 +11,7 @@ Implementation of Batch Normalization layer. #include "../precomp.hpp" #include "layers_common.hpp" +#include "op_halide.hpp" #include namespace cv @@ -29,6 +30,13 @@ public: poolStride = Size(params.get("pool_stride_w"), params.get("pool_stride_h")); } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide() && + !poolPad.width && !poolPad.height; + } + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, @@ -81,6 +89,54 @@ public: } } } + + virtual Ptr initHalide(const std::vector > &input) + { +#ifdef HAVE_HALIDE + // Meaningless operation if false because if kernel > stride + // it is not deterministic and if kernel < stride we just + // skip a part of input data (you'd better change your model). + if (poolKernel.width != poolStride.width || + poolKernel.height != poolStride.height) + CV_Error(cv::Error::StsNotImplemented, + "Halide backend for maximum unpooling " + "is not support cases when kernel != stride"); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? 
Halide::Func() : Halide::Func(name)); + Halide::Buffer inputBuffer = halideBuffer(input[0]); + Halide::Buffer indices = halideBuffer(input[1]); + + Halide::Expr pooledX = x / poolKernel.width; + Halide::Expr pooledY = y / poolKernel.height; + + const int outW = inputBuffer.width() * poolKernel.width; + top(x, y, c, n) = select(y * outW + x == indices(pooledX, pooledY, c, n), + inputBuffer(pooledX, pooledY, c, n), 0.0f); + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), yi("yi"), yo("yo"); + Halide::Func& top = node.dynamicCast()->funcs.back(); + + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + top.reorder(x, c, y) + .split(y, yo, yi, 2) + .fuse(yo, n, tile) + .parallel(tile) + .unroll(yi) + .vectorize(x, outW >= 16 ? 16 : outW); +#endif // HAVE_HALIDE + } }; Ptr MaxUnpoolLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index aed3ffda3..9ca56cdac 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -41,6 +41,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "op_halide.hpp" #include #include using std::max; @@ -92,6 +93,14 @@ public: getConvPoolPaddings(inp, out, kernel, stride, padMode, pad); } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide() && + (type == PoolingLayer::MAX || + type == PoolingLayer::AVE && !pad.width && !pad.height); + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { for (size_t ii = 0; ii < inputs.size(); ii++) @@ -111,6 +120,16 @@ public: } } + virtual Ptr initHalide(const std::vector > &inputs) + { + if (type == PoolingLayer::MAX) + return initMaxPoolingHalide(inputs); + else if (type == PoolingLayer::AVE) + return initAvePoolingHalide(inputs); + else + return Ptr(); + } + void maxPooling(Mat &src, Mat &dst, Mat &mask) { Size inp(src.size[3], src.size[2]), @@ -195,6 +214,120 @@ public: } } + virtual Ptr initMaxPoolingHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer inputBuffer = halideBuffer(inputs[0]); + const int inWidth = inputBuffer.width(); + const int inHeight = inputBuffer.height(); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name)); + Halide::RDom r(0, kernel.width, 0, kernel.height); + Halide::Expr kx, ky; + if (pad.width || pad.height) + { + kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1); + ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1); + } + else + { + kx = min(x * stride.width + r.x, inWidth - 1); + ky = min(y * stride.height + r.y, inHeight - 1); + } + + // Halide::argmax returns tuple (r.x, r.y, max). + Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n)); + + // Compute offset from argmax in range [0, kernel_size). 
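+ // res[0] and res[1] are offsets of the maximum inside the kernel window.
+ // Below they are converted to a flattened index (y * inWidth + x) of the input
+ // image, which is what the MaxUnpool layer expects as its second input.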
+ Halide::Expr max_index; + if (pad.width || pad.height) + { + max_index = clamp(y * stride.height + res[1] - pad.height, + 0, inHeight - 1) * inWidth + + clamp(x * stride.width + res[0] - pad.width, + 0, inWidth - 1); + } + else + { + max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth + + min(x * stride.width + res[0], inWidth - 1); + } + top(x, y, c, n) = { res[2], Halide::cast(max_index) }; + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual Ptr initAvePoolingHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer inputBuffer = halideBuffer(inputs[0]); + + const int inW = inputBuffer.width(), inH = inputBuffer.height(); + if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height) + { + CV_Error(cv::Error::StsNotImplemented, + "Halide backend for average pooling with partial " + "kernels is not implemented"); + } + + const float norm = 1.0f / (kernel.width * kernel.height); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name)); + Halide::RDom r(0, kernel.width, 0, kernel.height); + top(x, y, c, n) = sum( + inputBuffer(x * stride.width + r.x, + y * stride.height + r.y, c, n)) * norm; + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"), + xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co"); + Halide::Func& top = node.dynamicCast()->funcs.back(); + + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + if (outW < 8 || outH < 8) + { + if (outC > 8) + top.split(c, co, ci, 8) + .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile) + .parallel(tile) + .vectorize(ci); + else + { + top.fuse(y, c, tile).fuse(n, tile, tile) + .parallel(tile); + if (outW > 1) + top.vectorize(x); + } + } + else + { + if (outC > 8) + top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8) + .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile) + .parallel(tile) + .vectorize(xi); + else + top.split(x, xo, xi, 8).split(y, yo, yi, 8) + .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile) + .parallel(tile) + .vectorize(xi); + } +#endif // HAVE_HALIDE + } + bool getMemoryShapes(const std::vector &inputs, const int requiredOutputs, std::vector &outputs, diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 473b1b38b..738b7d031 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -11,6 +11,7 @@ Implementation of Scale layer. 
#include "../precomp.hpp" #include "layers_common.hpp" +#include "op_halide.hpp" #include namespace cv @@ -36,6 +37,12 @@ public: return true; } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide(); + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { CV_Assert(blobs.size() == 1 + hasBias); @@ -65,6 +72,58 @@ public: } } + virtual Ptr tryAttach(const Ptr& node) + { + switch (node->backendId) + { + case DNN_BACKEND_HALIDE: + { +#ifdef HAVE_HALIDE + auto base = node.dynamicCast(); + Halide::Func& input = base->funcs.back(); + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = attachHalide(input(x, y, c, n)); + return Ptr(new HalideBackendNode(base, top)); +#endif // HAVE_HALIDE + break; + } + } + return Ptr(); + } + + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer input = halideBuffer(inputs[0]); + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = attachHalide(input(x, y, c, n)); + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + +#ifdef HAVE_HALIDE + // attachHalide can work both with Halide::Buffer and Halide::Func. In the + // second case it will be a fusion. + Halide::Func attachHalide(const Halide::Expr& input) + { + Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name)); + Halide::Var x("x"), y("y"), c("c"), n("n"); + + const int numChannels = blobs[0].total(); + + auto weights = wrapToHalideBuffer(blobs[0], {numChannels}); + Halide::Expr topExpr = input * weights(c); + if (hasBias) + { + auto bias = wrapToHalideBuffer(blobs[1], {numChannels}); + topExpr += bias(c); + } + top(x, y, c, n) = topExpr; + return top; + } +#endif // HAVE_HALIDE + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 00dbc2672..a13fbff62 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -41,6 +41,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" +#include "op_halide.hpp" #include #include using std::max; @@ -74,6 +75,12 @@ public: return inplace; } + virtual bool supportBackend(int backendId) + { + return backendId == DNN_BACKEND_DEFAULT || + backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1; + } + void forward(std::vector &inputs, std::vector &outputs, std::vector &internals) { const Mat &src = *inputs[0]; @@ -155,6 +162,58 @@ public: } } + virtual Ptr initHalide(const std::vector > &inputs) + { +#ifdef HAVE_HALIDE + Halide::Buffer inputBuffer = halideBuffer(inputs[0]); + int inW, inH, inC, inN; + getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN); + + if (inW != 1 || inH != 1) + CV_Error(cv::Error::StsNotImplemented, + "Halide backend for SoftMax with spatial size " + "more than 1x1 is not implemented"); + + Halide::Var x("x"), y("y"), c("c"), n("n"); + Halide::Func top = (name.empty() ? 
Halide::Func() : Halide::Func(name)); + + Halide::Func expInput("expInput"); + Halide::RDom r(0, inW, 0, inH, 0, inC); + expInput(x, y, c, n) = exp(inputBuffer(x, y, c, n)); + Halide::Expr globalSum = sum(expInput(r.x, r.y, r.z, n)); + top(x, y, c, n) = expInput(x, y, c, n) / globalSum; + return Ptr(new HalideBackendNode(top)); +#endif // HAVE_HALIDE + return Ptr(); + } + + virtual void applyHalideScheduler(Ptr& node, + const std::vector &inputs, + const std::vector &outputs) const + { +#ifdef HAVE_HALIDE + int outW, outH, outC, outN; + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + + // Most common case when SoftMax is a layer after fully-connected. + // So we just schedule it in the same way. + Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), tile("tile"); + Halide::Func& top = node.dynamicCast()->funcs.back(); + + if (outC + outN == 1) + return; + + if (outC > 8) + top.split(c, co, ci, 8) + .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile) + .parallel(tile) + .vectorize(ci, 8); + else + top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile) + .parallel(tile); +#endif // HAVE_HALIDE + } + int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const { diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp index 975230173..6242172d0 100644 --- a/modules/dnn/src/layers/split_layer.cpp +++ b/modules/dnn/src/layers/split_layer.cpp @@ -72,7 +72,7 @@ public: { CV_Assert(inputs.size() == 1); - Layer::getMemoryShapes(inputs, outputsCount >= 0 ? outputsCount : requiredOutputs, + Layer::getMemoryShapes(inputs, max(1, outputsCount >= 0 ? outputsCount : requiredOutputs), outputs, internals); return true; } @@ -81,6 +81,7 @@ public: { for (size_t i = 0; i < outputs.size(); i++) { + CV_Assert(inputs[0]->total() == outputs[i].total()); if (outputs[i].data != inputs[0]->data) inputs[0]->copyTo(outputs[i]); } diff --git a/modules/dnn/src/op_halide.cpp b/modules/dnn/src/op_halide.cpp new file mode 100644 index 000000000..ae256771e --- /dev/null +++ b/modules/dnn/src/op_halide.cpp @@ -0,0 +1,172 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. + +#include "op_halide.hpp" + +namespace cv +{ +namespace dnn +{ + +#ifdef HAVE_HALIDE +Halide::Buffer wrapToHalideBuffer(const Mat& mat) +{ + int n, c, w, h; + getCanonicalSize(mat.size, &w, &h, &c, &n); + return wrapToHalideBuffer(mat, {w, h, c, n}); +} + +Halide::Buffer wrapToHalideBuffer(const Mat& mat, + const std::vector& sizes) +{ + Halide::Buffer buffer((float*)mat.data, sizes); + buffer.set_host_dirty(); // Indicate that data is on CPU. 
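+ // Halide copies the data to a device lazily, right before the first evaluation there.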
+ return buffer; +} + +Halide::Buffer<> halideBuffer(const Ptr& ptr) +{ + CV_Assert(!ptr.empty()); + return ptr.dynamicCast()->buffer; +} + +std::vector > halideBuffers(const std::vector >& ptrs) +{ + std::vector > vec; + vec.reserve(ptrs.size()); + for (const Ptr& ptr : ptrs) + { + vec.push_back(halideBuffer(ptr)); + } + return vec; +} + +void getCanonicalSize(const Halide::Buffer<>& buffer, int* width, int* height, + int* channels, int* batch) +{ + CV_Assert(buffer.dimensions() == 4); + *width = buffer.extent(0); + *height = buffer.extent(1); + *channels = buffer.extent(2); + *batch = buffer.extent(3); +} + +HalideBackendNode::HalideBackendNode(const Halide::Func& func) + : BackendNode(DNN_BACKEND_HALIDE), funcs(1, func) {} + +HalideBackendNode::HalideBackendNode(const std::vector& funcs) + : BackendNode(DNN_BACKEND_HALIDE), funcs(funcs) {} + +HalideBackendNode::HalideBackendNode(const Ptr& base, + const Halide::Func& top) + : BackendNode(DNN_BACKEND_HALIDE), funcs(base->funcs) +{ + funcs.back() = top; +} + +HalideBackendWrapper::HalideBackendWrapper(int targetId, const cv::Mat& m) + : BackendWrapper(DNN_BACKEND_HALIDE, targetId) +{ + buffer = wrapToHalideBuffer(m); + if (targetId != DNN_TARGET_CPU) + CV_Error(Error::StsNotImplemented, "Unknown target identifier"); +} + +HalideBackendWrapper::HalideBackendWrapper(const Ptr& base, + const MatShape& shape) + : BackendWrapper(DNN_BACKEND_HALIDE, base->targetId) +{ + if (base->targetId != DNN_TARGET_CPU) + CV_Error(Error::StsNotImplemented, "Unknown target identifier"); + + int w, h, c, n; + getCanonicalSize(shape, &w, &h, &c, &n); + Halide::Buffer baseBuffer = halideBuffer(base); + buffer = Halide::Buffer((float*)baseBuffer.raw_buffer()->host, + {w, h, c, n}); + buffer.set_host_dirty(); // Indicate that data is on CPU. 
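+ // The wrapper reuses the host memory of the base buffer, so no device memory is allocated here.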
+} +#endif // HAVE_HALIDE + +void getCanonicalSize(const MatSize& size, int* width, int* height, + int* channels, int* batch) +{ + const int dims = size.p[-1]; + CV_Assert(dims == 2 || dims == 4); + *batch = size[0]; + *channels = size[1]; + if (dims == 4) + { + *width = size[3]; + *height = size[2]; + } + else + { + *width = 1; + *height = 1; + } +} + +void getCanonicalSize(const MatShape& shape, int* width, int* height, + int* channels, int* batch) +{ + const int dims = shape.size(); + CV_Assert(dims == 2 || dims == 4); + *batch = shape[0]; + *channels = shape[1]; + if (dims == 4) + { + *width = shape[3]; + *height = shape[2]; + } + else + { + *width = 1; + *height = 1; + } +} + +void compileHalide(std::vector &outputs, Ptr& node, int targetId) +{ +#ifdef HAVE_HALIDE + CV_Assert(!node.empty()); + Halide::Func& top = node.dynamicCast()->funcs.back(); + + int outW, outH, outC, outN; + Halide::Var x("x"), y("y"), c("c"), n("n"); + getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); + top.bound(x, 0, outW).bound(y, 0, outH) + .bound(c, 0, outC).bound(n, 0, outN); + + Halide::Target target = Halide::get_host_target(); + target.set_feature(Halide::Target::NoAsserts); + top.compile_jit(target); +#endif // HAVE_HALIDE +} + +void forwardHalide(std::vector > &outputs, + const Ptr& node) +{ +#ifdef HAVE_HALIDE + CV_Assert(!node.empty()); + Halide::Func& top = node.dynamicCast()->funcs.back(); + auto outputBuffers = halideBuffers(outputs); + top.realize(Halide::Realization(outputBuffers)); +#endif // HAVE_HALIDE +} + +bool haveHalide() +{ +#ifdef HAVE_HALIDE + return true; +#else + return false; +#endif // HAVE_HALIDE +} + +} // namespace dnn +} // namespace cv diff --git a/modules/dnn/src/op_halide.hpp b/modules/dnn/src/op_halide.hpp new file mode 100644 index 000000000..4e60945db --- /dev/null +++ b/modules/dnn/src/op_halide.hpp @@ -0,0 +1,82 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifndef __OPENCV_DNN_OP_HALIDE_HPP__ +#define __OPENCV_DNN_OP_HALIDE_HPP__ + +#include "precomp.hpp" + +#ifdef HAVE_HALIDE +#include +#endif // HAVE_HALIDE + +namespace cv +{ +namespace dnn +{ +#ifdef HAVE_HALIDE + // Returns a four-dimensional buffer with float32 type that wraps cv::Mat data. + // No data copy here. + Halide::Buffer wrapToHalideBuffer(const Mat& mat); + + Halide::Buffer wrapToHalideBuffer(const Mat& mat, + const std::vector& shape); + + // Extract batch size, number of channels, width and height from buffer. + void getCanonicalSize(const Halide::Buffer<>& buffer, int* width, int* height, + int* channels, int* batch); + + // Cast the pointer and create a copy of the Halide buffer object. No data is copied. + Halide::Buffer<> halideBuffer(const Ptr& ptr); + + std::vector > halideBuffers(const std::vector >& ptrs); + + class HalideBackendNode : public BackendNode + { + public: + HalideBackendNode(const Halide::Func& func); + + HalideBackendNode(const std::vector& funcs); + + // Initialize from the base node but replace the last function with the given one. + // It's used in case of layer fusing, when we want to keep the functions of the + // root layer but replace the top one by the fused function (i.e. conv+padding to relu+padding). 
+ HalideBackendNode(const Ptr& base, const Halide::Func& top); + + std::vector funcs; + }; + + class HalideBackendWrapper : public BackendWrapper + { + public: + HalideBackendWrapper(int targetId, const cv::Mat& m); + + HalideBackendWrapper(const Ptr& base, const MatShape& shape); + + Halide::Buffer buffer; + }; +#endif // HAVE_HALIDE + + // Extract batch size, number of channels, width and height from MatSize. + void getCanonicalSize(const MatSize& size, int* width, int* height, + int* channels, int* batch); + + void getCanonicalSize(const MatShape& shape, int* width, int* height, + int* channels, int* batch); + + // Realize Halide pipeline into output blobs. + void forwardHalide(std::vector > &outputs, + const Ptr& node); + + // Compile Halide pipeline for a specific target. Use outputs to set bounds of functions. + void compileHalide(std::vector &outputs, Ptr& node, int targetId); + + bool haveHalide(); +} // namespace dnn +} // namespace cv + +#endif // __OPENCV_DNN_OP_HALIDE_HPP__ diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 36e16fd1b..f1a52e9b6 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -574,7 +574,6 @@ void TFImporter::populateNet(Net dstNet) { CV_Assert(layer.input_size() == 2); - layerParams.set("axis", 0); layerParams.set("bias_term", false); layerParams.blobs.resize(1); @@ -622,7 +621,6 @@ void TFImporter::populateNet(Net dstNet) } else if (type == "Softmax") { - layerParams.set("axis", -1); int id = dstNet.addLayer(name, "Softmax", layerParams); layer_id[name] = id; diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp new file mode 100644 index 000000000..db6b1c02c --- /dev/null +++ b/modules/dnn/test/test_halide_layers.cpp @@ -0,0 +1,637 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. + +// These tests don't require any external data. They just compare outputs of +// layers computed by different backends. Inputs and parameters are random. 
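+// Each test builds a small network (typically a single layer), runs it with the
+// default backend and with the Halide one, and compares the outputs via normAssert.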
+ +namespace cvtest +{ + +#ifdef HAVE_HALIDE +using namespace cv; +using namespace cv::dnn; +using namespace testing; + +static void test(LayerParams& params, Mat& input) +{ + randu(input, -1.0f, 1.0f); + + Net net; + int lid = net.addLayer(params.name, params.type, params); + net.connect(0, 0, lid, 0); + + net.setBlob("", input); + net.allocate(); + net.forward(); + Mat outputDefault = net.getBlob(params.name).clone(); + + net.setPreferableBackend(DNN_BACKEND_HALIDE); + net.forward(); + Mat outputHalide = net.getBlob(params.name).clone(); + normAssert(outputDefault, outputHalide); +} + +//////////////////////////////////////////////////////////////////////////////// +// Convolution +//////////////////////////////////////////////////////////////////////////////// +typedef TestWithParam > Convolution; +TEST_P(Convolution, Accuracy) +{ + int inChannels = get<0>(GetParam())[0]; + int outChannels = get<0>(GetParam())[1]; + int group = get<0>(GetParam())[2]; + Size inSize = get<1>(GetParam()); + Size kernel = get<2>(GetParam()); + Size stride = get<3>(GetParam()); + Size pad = get<4>(GetParam()); + Size dilation = get<5>(GetParam()); + bool hasBias = get<6>(GetParam()); + + Mat weights({outChannels, inChannels / group, kernel.height, kernel.width}, CV_32F); + randu(weights, -1.0f, 1.0f); + + LayerParams lp; + lp.set("kernel_w", kernel.width); + lp.set("kernel_h", kernel.height); + lp.set("pad_w", pad.width); + lp.set("pad_h", pad.height); + lp.set("stride_w", stride.width); + lp.set("stride_h", stride.height); + lp.set("dilation_w", dilation.width); + lp.set("dilation_h", dilation.height); + lp.set("num_output", outChannels); + lp.set("group", group); + lp.set("bias_term", hasBias); + lp.type = "Convolution"; + lp.name = "testLayer"; + lp.blobs.push_back(weights); + if (hasBias) + { + Mat bias({outChannels}, CV_32F); + randu(bias, -1.0f, 1.0f); + lp.blobs.push_back(bias); + } + Mat input({1, inChannels, inSize.height, inSize.width}, CV_32F); + test(lp, input); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Convolution, Combine( +/*in channels, out channels, group*/ + Values(Vec3i(6, 4, 1), Vec3i(6, 9, 1), + Vec3i(6, 4, 2), Vec3i(6, 9, 3)), +/*in size*/ Values(Size(5, 6)), +/*kernel*/ Values(Size(3, 1), Size(1, 3)), +/*stride*/ Values(Size(1, 1), Size(2, 2)), +/*pad*/ Values(Size(1, 0), Size(0, 1)), +/*dilation*/ Values(Size(1, 1), Size(2, 2)), +/*has bias*/ Bool() +)); + +//////////////////////////////////////////////////////////////////////////////// +// Deconvolution +//////////////////////////////////////////////////////////////////////////////// +typedef TestWithParam > Deconvolution; +TEST_P(Deconvolution, Accuracy) +{ + int inChannels = get<0>(GetParam())[0]; + int outChannels = get<0>(GetParam())[1]; + int group = get<0>(GetParam())[2]; + Size inSize = get<1>(GetParam()); + Size kernel = get<2>(GetParam()); + Size pad = get<3>(GetParam()); + Size dilation = get<4>(GetParam()); + Size stride = Size(get<5>(GetParam())[0], get<5>(GetParam())[1]); + Size adjPad = Size(get<5>(GetParam())[2], get<5>(GetParam())[3]); + bool hasBias = get<6>(GetParam()); + + Mat weights({outChannels, inChannels / group, kernel.height, kernel.width}, CV_32F); + randu(weights, -1.0f, 1.0f); + + LayerParams lp; + lp.set("kernel_w", kernel.width); + lp.set("kernel_h", kernel.height); + lp.set("pad_w", pad.width); + lp.set("pad_h", pad.height); + lp.set("stride_w", stride.width); + lp.set("stride_h", stride.height); + lp.set("dilation_w", dilation.width); + lp.set("dilation_h", dilation.height); + lp.set("adj_w", 
adjPad.width); + lp.set("adj_h", adjPad.height); + lp.set("num_output", outChannels); + lp.set("group", group); + lp.set("bias_term", hasBias); + lp.type = "Deconvolution"; + lp.name = "testLayer"; + lp.blobs.push_back(weights); + if (hasBias) + { + Mat bias({outChannels}, CV_32F); + randu(bias, -1.0f, 1.0f); + lp.blobs.push_back(bias); + } + Mat input({1, inChannels, inSize.height, inSize.width}, CV_32F); + test(lp, input); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Deconvolution, Combine( +/*in channels, out channels, group*/ + Values(Vec3i(6, 4, 1), Vec3i(6, 9, 1)), +/*in size*/ Values(Size(5, 6)), +/*kernel*/ Values(Size(3, 1), Size(1, 3)), +/*pad*/ Values(Size(1, 0), Size(0, 1)), +/*dilation*/ Values(Size(1, 1), Size(2, 2)), +/*stride, adj. pad*/ Values(Vec4i(1,1, 0,0), Vec4i(2,2, 1,0), Vec4i(1,2, 0,1)), +/*has bias*/ Bool() +)); + +//////////////////////////////////////////////////////////////////////////////// +// LRN +//////////////////////////////////////////////////////////////////////////////// +typedef TestWithParam > LRN; +TEST_P(LRN, Accuracy) +{ + int inChannels = get<0>(GetParam())[0]; + Size inSize = Size(get<0>(GetParam())[1], get<0>(GetParam())[2]); + int localSize = get<1>(GetParam()); + float alpha = get<2>(GetParam())[0]; + float beta = get<2>(GetParam())[1]; + float bias = get<2>(GetParam())[2]; + bool normBySize = get<3>(GetParam()); + std::string nrmType = get<4>(GetParam()); + + LayerParams lp; + lp.set("norm_region", nrmType); + lp.set("local_size", localSize); + lp.set("alpha", alpha); + lp.set("beta", beta); + lp.set("bias", bias); + lp.set("norm_by_size", normBySize); + lp.type = "LRN"; + lp.name = "testLayer"; + + Mat input({1, inChannels, inSize.height, inSize.width}, CV_32F); + test(lp, input); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, LRN, Combine( +/*input ch,w,h*/ Values(Vec3i(6, 5, 8), Vec3i(7, 11, 6)), +/*local size*/ Values(3, 5), + Values(Vec3f(0.9f, 1.0f, 1.1f), Vec3f(0.9f, 1.1f, 1.0f), +/*alpha, beta,*/ Vec3f(1.0f, 0.9f, 1.1f), Vec3f(1.0f, 1.1f, 0.9f), +/*bias */ Vec3f(1.1f, 0.9f, 1.0f), Vec3f(1.1f, 1.0f, 0.9f)), +/*norm_by_size*/ Bool(), +/*norm_type*/ Values("ACROSS_CHANNELS", "WITHIN_CHANNEL") +)); + +//////////////////////////////////////////////////////////////////////////////// +// Average pooling +//////////////////////////////////////////////////////////////////////////////// +typedef TestWithParam > AvePooling; +TEST_P(AvePooling, Accuracy) +{ + int inChannels = get<0>(GetParam()); + Size outSize = get<1>(GetParam()); // Input size will be computed from parameters. 
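+ // For pooling without padding: inSize = (outSize - 1) * stride + kernel.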
+ Size kernel = get<2>(GetParam()); + Size stride = get<3>(GetParam()); + + const int inWidth = (outSize.width - 1) * stride.width + kernel.width; + const int inHeight = (outSize.height - 1) * stride.height + kernel.height; + + LayerParams lp; + lp.set("pool", "ave"); + lp.set("kernel_w", kernel.width); + lp.set("kernel_h", kernel.height); + lp.set("stride_w", stride.width); + lp.set("stride_h", stride.height); + lp.type = "Pooling"; + lp.name = "testLayer"; + + Mat input({1, inChannels, inHeight, inWidth}, CV_32F); + test(lp, input); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, AvePooling, Combine( +/*in channels*/ Values(3, 4), +/*out size*/ Values(Size(1, 1), Size(2, 2), Size(3, 2), Size(4, 7)), +/*kernel*/ Values(Size(1, 1), Size(2, 2), Size(3, 3), Size(3, 2)), +/*stride*/ Values(Size(1, 1), Size(2, 2), Size(3, 2)) +)); + +//////////////////////////////////////////////////////////////////////////////// +// Maximum pooling +//////////////////////////////////////////////////////////////////////////////// +typedef TestWithParam > MaxPooling; +TEST_P(MaxPooling, Accuracy) +{ + int inChannels = get<0>(GetParam()); + Size inSize = get<1>(GetParam()); + Size kernel = get<2>(GetParam()); + Size stride = get<3>(GetParam()); + Size pad = get<4>(GetParam()); + + LayerParams lp; + lp.set("pool", "max"); + lp.set("kernel_w", kernel.width); + lp.set("kernel_h", kernel.height); + lp.set("stride_w", stride.width); + lp.set("stride_h", stride.height); + lp.set("pad_w", pad.width); + lp.set("pad_h", pad.height); + lp.type = "Pooling"; + lp.name = "testLayer"; + + Mat input({1, inChannels, inSize.height, inSize.width}, CV_32F); + test(lp, input); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine( +/*in channels*/ Values(3, 4), +/*in size*/ Values(Size(5, 5), Size(7, 6)), +/*kernel*/ Values(Size(2, 2), Size(3, 3), Size(3, 2)), +/*stride*/ Values(Size(1, 1), Size(2, 2), Size(3, 2)), +/*pad*/ Values(Size(0, 0), Size(1, 1), Size(0, 1)) +)); + +//////////////////////////////////////////////////////////////////////////////// +// Fully-connected +//////////////////////////////////////////////////////////////////////////////// +typedef TestWithParam > FullyConnected; +TEST_P(FullyConnected, Accuracy) +{ + int inChannels = get<0>(GetParam()); + Size inSize = get<1>(GetParam()); + int outChannels = get<2>(GetParam()); + bool hasBias = get<3>(GetParam()); + + Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F); + randu(weights, -1.0f, 1.0f); + + Mat bias(1, outChannels, CV_32F); + randu(bias, -1.0f, 1.0f); + + LayerParams lp; + lp.set("num_output", outChannels); + lp.set("bias_term", hasBias); + lp.blobs.push_back(weights); + lp.blobs.push_back(bias); + lp.type = "InnerProduct"; + lp.name = "testLayer"; + + Mat input({1, inChannels, inSize.height, inSize.width}, CV_32F); + test(lp, input); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine( +/*in channels*/ Values(3, 4), +/*in size*/ Values(Size(5, 4), Size(4, 5), Size(1, 1)), +/*out channels*/ Values(3, 4), +/*has bias*/ Bool() +)); + +//////////////////////////////////////////////////////////////////////////////// +// SoftMax +//////////////////////////////////////////////////////////////////////////////// +typedef TestWithParam > SoftMax; +TEST_P(SoftMax, Accuracy) +{ + int inChannels = get<0>(GetParam()); + LayerParams lp; + lp.type = "SoftMax"; + lp.name = "testLayer"; + + Mat input({1, inChannels, 1, 1}, CV_32F); + test(lp, input); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, SoftMax, 
Values(3, 4, 5, 1024)); + +////////////////////////////////////////////////////////////////////////////// +// Max pooling - unpooling +////////////////////////////////////////////////////////////////////////////// +TEST(MaxPoolUnpool_Halide, Accuracy) +{ + LayerParams pool; + pool.set("pool", "max"); + pool.set("kernel_w", 2); + pool.set("kernel_h", 2); + pool.set("stride_w", 2); + pool.set("stride_h", 2); + pool.set("pad_w", 0); + pool.set("pad_h", 0); + pool.type = "Pooling"; + pool.name = "testPool"; + + LayerParams unpool; + unpool.set("pool_k_w", 2); + unpool.set("pool_k_h", 2); + unpool.set("pool_stride_w", 2); + unpool.set("pool_stride_h", 2); + unpool.set("pool_pad_w", 0); + unpool.set("pool_pad_h", 0); + unpool.type = "MaxUnpool"; + unpool.name = "testUnpool"; + + Net net; + int poolId = net.addLayer(pool.name, pool.type, pool); + net.connect(0, 0, poolId, 0); + + int unpoolId = net.addLayer(unpool.name, unpool.type, unpool); + net.connect(poolId, 0, unpoolId, 0); + net.connect(poolId, 1, unpoolId, 1); + + Mat input({1, 1, 4, 4}, CV_32F); + randu(input, -1.0f, 1.0f); + net.setBlob("", input); + net.forward(); + Mat outputDefault = net.getBlob("testUnpool").clone(); + + net.setPreferableBackend(DNN_BACKEND_HALIDE); + net.setBlob("", input); + net.forward(); + Mat outputHalide = net.getBlob("testUnpool").clone(); + normAssert(outputDefault, outputHalide); +} + +//////////////////////////////////////////////////////////////////////////////// +// AvePooling + in-place layers +//////////////////////////////////////////////////////////////////////////////// +static const int kNumChannels = 3; + +void testInPlaceActivation(LayerParams& lp) +{ + EXPECT_FALSE(lp.name.empty()); + + LayerParams pool; + pool.set("pool", "ave"); + pool.set("kernel_w", 2); + pool.set("kernel_h", 2); + pool.set("stride_w", 2); + pool.set("stride_h", 2); + pool.type = "Pooling"; + + Net net; + int poolId = net.addLayer(pool.name, pool.type, pool); + net.connect(0, 0, poolId, 0); + net.addLayerToPrev(lp.name, lp.type, lp); + + Mat input({1, kNumChannels, 10, 10}, CV_32F); + randu(input, -1.0f, 1.0f); + net.setBlob("", input); + net.forward(); + Mat outputDefault = net.getBlob(lp.name).clone(); + + net.setBlob("", input); + net.setPreferableBackend(DNN_BACKEND_HALIDE); + net.forward(); + Mat outputHalide = net.getBlob(lp.name).clone(); + normAssert(outputDefault, outputHalide); +} + +typedef TestWithParam > BatchNorm; +TEST_P(BatchNorm, Accuracy) +{ + bool hasWeights = get<0>(GetParam()); + bool hasBias = get<1>(GetParam()); + float epsilon = get<2>(GetParam()); + + LayerParams lp; + lp.set("has_weight", hasWeights); + lp.set("has_bias", hasBias); + lp.set("eps", epsilon); + lp.type = "BatchNorm"; + lp.name = "testLayer"; + + lp.blobs.reserve(4); + for (int i = 0; i < 3; ++i) + lp.blobs.push_back(Mat({kNumChannels}, CV_32F)); + if (hasBias || hasWeights) + lp.blobs.push_back(Mat({kNumChannels}, CV_32F)); + + for (Mat& m : lp.blobs) + randu(m, 0.0f, 1.0f); + + testInPlaceActivation(lp); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, BatchNorm, Combine( +/*has weights*/ Bool(), +/*has bias*/ Bool(), +/*epsilon*/ Values(1e-3f, 1e-5f) +)); + +typedef TestWithParam > ReLU; +TEST_P(ReLU, Accuracy) +{ + float negativeSlope = get<0>(GetParam()); + + LayerParams lp; + lp.set("negative_slope", negativeSlope); + lp.type = "ReLU"; + lp.name = "testLayer"; + testInPlaceActivation(lp); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, ReLU, Values( +/*negative slope*/ 2.0f, 0.3f, -0.1f +)); + +typedef TestWithParam > 
NoParamActivation; +TEST_P(NoParamActivation, Accuracy) +{ + LayerParams lp; + lp.type = get<0>(GetParam()); + lp.name = "testLayer"; + testInPlaceActivation(lp); +} +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, NoParamActivation, Values( +/*type*/ "TanH", "Sigmoid", "AbsVal", "BNLL" +)); + +typedef TestWithParam > Power; +TEST_P(Power, Accuracy) +{ + float power = get<0>(GetParam())[0]; + float scale = get<0>(GetParam())[1]; + float shift = get<0>(GetParam())[2]; + + LayerParams lp; + lp.set("power", power); + lp.set("scale", scale); + lp.set("shift", shift); + lp.type = "Power"; + lp.name = "testLayer"; + testInPlaceActivation(lp); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Power, +/*power, scale, shift*/ Values(Vec3f(0.9f, 1.0f, 1.1f), Vec3f(0.9f, 1.1f, 1.0f), + Vec3f(1.0f, 0.9f, 1.1f), Vec3f(1.0f, 1.1f, 0.9f), + Vec3f(1.1f, 0.9f, 1.0f), Vec3f(1.1f, 1.0f, 0.9f)) +); + +TEST(ChannelsPReLU, Accuracy) +{ + LayerParams lp; + lp.type = "ChannelsPReLU"; + lp.name = "testLayer"; + lp.blobs.push_back(Mat({kNumChannels}, CV_32F)); + randu(lp.blobs[0], -1.0f, 1.0f); + + testInPlaceActivation(lp); +} + +typedef TestWithParam > Scale; +TEST_P(Scale, Accuracy) +{ + bool hasBias = get<0>(GetParam()); + + LayerParams lp; + lp.set("bias_term", hasBias); + lp.type = "Scale"; + lp.name = "testLayer"; + lp.blobs.push_back(Mat({kNumChannels}, CV_32F)); + randu(lp.blobs[0], -1.0f, 1.0f); + if (hasBias) + { + lp.blobs.push_back(Mat({kNumChannels}, CV_32F)); + randu(lp.blobs[1], -1.0f, 1.0f); + } + testInPlaceActivation(lp); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Scale, Values(true, false)); + +//////////////////////////////////////////////////////////////////////////////// +// Concat layer +//////////////////////////////////////////////////////////////////////////////// +// +// input --- conv --- concat --- output +// `--- conv ----^ ^ ^ +// `---- ... 
------' ' +// `-----------------' +typedef TestWithParam > Concat; +TEST_P(Concat, Accuracy) +{ + Vec3i inSize = get<0>(GetParam()); + Vec3i numChannels = get<1>(GetParam()); + + Net net; + + LayerParams concatParam; + concatParam.type = "Concat"; + concatParam.name = "testLayer"; + int concatId = net.addLayer(concatParam.name, concatParam.type, concatParam); + net.connect(0, 0, concatId, 0); + + for (int i = 0, n = numChannels.channels; i < n; ++i) + { + if (!numChannels[i]) + break; + + Mat weights({numChannels[i], inSize[0], 1, 1}, CV_32F); + randu(weights, -1.0f, 1.0f); + + LayerParams convParam; + convParam.set("kernel_w", 1); + convParam.set("kernel_h", 1); + convParam.set("num_output", numChannels[i]); + convParam.set("bias_term", false); + convParam.type = "Convolution"; + std::ostringstream ss; + ss << "convLayer" << i; + convParam.name = ss.str(); + convParam.blobs.push_back(weights); + + int convId = net.addLayer(convParam.name, convParam.type, convParam); + net.connect(0, 0, convId, 0); + net.connect(convId, 0, concatId, i + 1); + } + + Mat input({1, inSize[0], inSize[1], inSize[2]}, CV_32F); + randu(input, -1.0f, 1.0f); + + net.setBlob("", input); + net.forward(); + Mat outputDefault = net.getBlob(concatParam.name).clone(); + + net.setPreferableBackend(DNN_BACKEND_HALIDE); + net.forward(); + Mat outputHalide = net.getBlob(concatParam.name).clone(); + normAssert(outputDefault, outputHalide); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Concat, Combine( +/*input size*/ Values(Vec3i(1, 4, 5), Vec3i(2, 8, 6)), +/*channels*/ Values(Vec3i(2, 0, 0), Vec3i(3, 4, 0), Vec3i(1, 6, 2)) +)); + +//////////////////////////////////////////////////////////////////////////////// +// Element-wise layers +//////////////////////////////////////////////////////////////////////////////// +// +// input --- conv --- eltwise --- output +// `--- conv ----^ ^ ^ +// `---- ... 
------' ' +// `-----------------' +typedef TestWithParam > Eltwise; +TEST_P(Eltwise, Accuracy) +{ + Vec3i inSize = get<0>(GetParam()); + std::string op = get<1>(GetParam()); + int numConv = get<2>(GetParam()); + + Net net; + + LayerParams eltwiseParam; + eltwiseParam.type = "Eltwise"; + eltwiseParam.name = "testLayer"; + int eltwiseId = net.addLayer(eltwiseParam.name, eltwiseParam.type, eltwiseParam); + net.connect(0, 0, eltwiseId, 0); + + for (int i = 0; i < numConv; ++i) + { + Mat weights({inSize[0], inSize[0], 1, 1}, CV_32F); + randu(weights, -1.0f, 1.0f); + + LayerParams convParam; + convParam.set("kernel_w", 1); + convParam.set("kernel_h", 1); + convParam.set("num_output", inSize[0]); + convParam.set("bias_term", false); + convParam.type = "Convolution"; + std::ostringstream ss; + ss << "convLayer" << i; + convParam.name = ss.str(); + convParam.blobs.push_back(weights); + + int convId = net.addLayer(convParam.name, convParam.type, convParam); + net.connect(0, 0, convId, 0); + net.connect(convId, 0, eltwiseId, i + 1); + } + + Mat input({1, inSize[0], inSize[1], inSize[2]}, CV_32F); + randu(input, -1.0f, 1.0f); + + net.setBlob("", input); + net.forward(); + Mat outputDefault = net.getBlob(eltwiseParam.name).clone(); + + net.setPreferableBackend(DNN_BACKEND_HALIDE); + net.forward(); + Mat outputHalide = net.getBlob(eltwiseParam.name).clone(); + normAssert(outputDefault, outputHalide); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, Eltwise, Combine( +/*input size*/ Values(Vec3i(1, 4, 5), Vec3i(2, 8, 6)), +/*operation*/ Values("prod", "sum", "max"), +/*num convs*/ Values(1, 2, 3) +)); +#endif // HAVE_HALIDE + +} // namespace cvtest diff --git a/modules/dnn/test/test_halide_nets.cpp b/modules/dnn/test/test_halide_nets.cpp new file mode 100644 index 000000000..7adc78eb4 --- /dev/null +++ b/modules/dnn/test/test_halide_nets.cpp @@ -0,0 +1,124 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. 
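+// These tests run complete networks with the default and the Halide backends
+// and compare their outputs. Model files are taken from the opencv_extra repository.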
+ +namespace cvtest +{ + +#ifdef HAVE_HALIDE +using namespace cv; +using namespace dnn; + +static void loadNet(const std::string& weights, const std::string& proto, + const std::string& framework, Net* net) +{ + if (framework == "caffe") + { + *net = cv::dnn::readNetFromCaffe(proto, weights); + } + else if (framework == "torch") + { + *net = cv::dnn::readNetFromTorch(weights); + } + else if (framework == "tensorflow") + { + *net = cv::dnn::readNetFromTensorflow(weights); + } + else + CV_Error(Error::StsNotImplemented, "Unknown framework " + framework); +} + +static void test(const std::string& weights, const std::string& proto, + const std::string& scheduler, int inWidth, int inHeight, + const std::string& outputLayer, const std::string& framework, + int targetId) +{ + Mat input(inHeight, inWidth, CV_32FC3), outputDefault, outputHalide; + randu(input, 0.0f, 1.0f); + + Net netDefault, netHalide; + loadNet(weights, proto, framework, &netDefault); + loadNet(weights, proto, framework, &netHalide); + + netDefault.setBlob("", blobFromImage(input.clone(), 1.0f, false)); + netDefault.forward(netDefault.getLayerId(outputLayer)); + outputDefault = netDefault.getBlob(outputLayer).clone(); + + netHalide.setBlob("", blobFromImage(input.clone(), 1.0f, false)); + netHalide.setPreferableBackend(DNN_BACKEND_HALIDE); + netHalide.compileHalide(scheduler); + netHalide.forward(netHalide.getLayerId(outputLayer)); + outputHalide = netHalide.getBlob(outputLayer).clone(); + + normAssert(outputDefault, outputHalide); + + // An extra test: change input. + input *= 0.1f; + netDefault.setBlob("", blobFromImage(input.clone(), 1.0, false)); + netHalide.setBlob("", blobFromImage(input.clone(), 1.0, false)); + + normAssert(outputDefault, outputHalide); + + // Swap backends. + netHalide.setPreferableBackend(DNN_BACKEND_DEFAULT); + netHalide.forward(netHalide.getLayerId(outputLayer)); + + netDefault.setPreferableBackend(DNN_BACKEND_HALIDE); + netDefault.compileHalide(scheduler); + netDefault.forward(netDefault.getLayerId(outputLayer)); + + outputDefault = netHalide.getBlob(outputLayer).clone(); + outputHalide = netDefault.getBlob(outputLayer).clone(); + normAssert(outputDefault, outputHalide); +} + +TEST(Reproducibility_GoogLeNet_Halide, Accuracy) +{ + test(findDataFile("dnn/bvlc_googlenet.caffemodel"), + findDataFile("dnn/bvlc_googlenet.prototxt"), + "", 227, 227, "prob", "caffe", DNN_TARGET_CPU); +}; + +TEST(Reproducibility_AlexNet_Halide, Accuracy) +{ + test(getOpenCVExtraDir() + "/dnn/bvlc_alexnet.caffemodel", + getOpenCVExtraDir() + "/dnn/bvlc_alexnet.prototxt", + getOpenCVExtraDir() + "/dnn/halide_scheduler_alexnet.yml", + 227, 227, "prob", "caffe", DNN_TARGET_CPU); +}; + +// TEST(Reproducibility_ResNet_50_Halide, Accuracy) +// { +// test(getOpenCVExtraDir() + "/dnn/ResNet-50-model.caffemodel", +// getOpenCVExtraDir() + "/dnn/ResNet-50-deploy.prototxt", +// getOpenCVExtraDir() + "/dnn/halide_scheduler_resnet_50.yml", +// 224, 224, "prob", "caffe", DNN_TARGET_CPU); +// }; + +// TEST(Reproducibility_SqueezeNet_v1_1_Halide, Accuracy) +// { +// test(getOpenCVExtraDir() + "/dnn/squeezenet_v1_1.caffemodel", +// getOpenCVExtraDir() + "/dnn/squeezenet_v1_1.prototxt", +// getOpenCVExtraDir() + "/dnn/halide_scheduler_squeezenet_v1_1.yml", +// 227, 227, "prob", "caffe", DNN_TARGET_CPU); +// }; + +TEST(Reproducibility_Inception_5h_Halide, Accuracy) +{ + test(getOpenCVExtraDir() + "/dnn/tensorflow_inception_graph.pb", "", + getOpenCVExtraDir() + "/dnn/halide_scheduler_inception_5h.yml", + 224, 224, "softmax2", "tensorflow", 
DNN_TARGET_CPU); +}; + +TEST(Reproducibility_ENet_Halide, Accuracy) +{ + test(getOpenCVExtraDir() + "/dnn/Enet-model-best.net", "", + getOpenCVExtraDir() + "/dnn/halide_scheduler_enet.yml", + 512, 512, "l367_Deconvolution", "torch", DNN_TARGET_CPU); +}; +#endif // HAVE_HALIDE + +} // namespace cvtest diff --git a/modules/dnn/tutorials/tutorial_dnn_halide.markdown b/modules/dnn/tutorials/tutorial_dnn_halide.markdown new file mode 100644 index 000000000..be3145137 --- /dev/null +++ b/modules/dnn/tutorials/tutorial_dnn_halide.markdown @@ -0,0 +1,135 @@ +# How to enable Halide backend to improve efficiency {#tutorial_dnn_halide} + +## Introduction +This tutorial describes how to run your models in the OpenCV deep learning module +using the Halide language backend. Halide is an open-source project that lets us +write image processing algorithms in a well-readable format, schedule computations +according to a specific device and evaluate them with quite good efficiency. + +The official website of the Halide project: http://halide-lang.org/. + +## Efficiency comparison +Measured on Intel® Core™ i7-6700K CPU @ 4.00GHz x 8. + +Single image forward pass (in milliseconds): + +| Architecture | MKL backend | Halide backend | Speed-up ratio | |-----------------:|------------:|---------------:|---------------:| | AlexNet | 16.55 | 22.38 | x0.73 | | ResNet-50 | 63.69 | 73.91 | x0.86 | | SqueezeNet v1.1 | 10.11 | 8.21 | x1.23 | | Inception-5h | 35.38 | 37.06 | x0.95 | | ENet @ 3x512x256 | 82.26 | 41.21 | x1.99 | + +Scheduling directives can be found at [opencv_extra/testdata/dnn](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn). + +## Requirements +### LLVM compiler + +@note LLVM compilation might take a long time. + +- Download LLVM source code from http://releases.llvm.org/4.0.0/llvm-4.0.0.src.tar.xz. +Unpack it. Let **llvm_root** be the root directory of the source code. + +- Create directory **llvm_root**/tools/clang + +- Download Clang of the same version as LLVM. In our case it will be from +http://releases.llvm.org/4.0.0/cfe-4.0.0.src.tar.xz. Unpack it into +**llvm_root**/tools/clang. Note that it should be the root of the Clang source code. + +- Build LLVM on Linux +@code +cd llvm_root +mkdir build && cd build +cmake -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_TARGETS_TO_BUILD="X86" -DLLVM_ENABLE_ASSERTIONS=ON -DCMAKE_BUILD_TYPE=Release .. +make -j4 +@endcode + +- Build LLVM on Windows (Developer Command Prompt) +@code +mkdir \\path-to-llvm-build\\ && cd \\path-to-llvm-build\\ +cmake.exe -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_ENABLE_ASSERTIONS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\\path-to-llvm-install\\ -G "Visual Studio 14 Win64" \\path-to-llvm-src\\ +MSBuild.exe /m:4 /t:Build /p:Configuration=Release .\\INSTALL.vcxproj +@endcode + +@note `\\path-to-llvm-build\\` and `\\path-to-llvm-install\\` are different directories. + +### Halide language + +- Download the source code from the GitHub repository https://github.com/halide/Halide, +or clone it using git. The root directory will be **halide_root**. +@code +git clone https://github.com/halide/Halide.git +@endcode + +- Build Halide on Linux +@code +cd halide_root +mkdir build && cd build +cmake -DLLVM_DIR=llvm_root/build/lib/cmake/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_VERSION=40 -DWITH_TESTS=OFF -DWITH_APPS=OFF -DWITH_TUTORIALS=OFF .. 
+make -j4 +@endcode + +- Build Halide on Windows (Developer Command Prompt) +@code +cd halide_root +mkdir build && cd build +cmake.exe -DLLVM_DIR=\\path-to-llvm-install\\lib\\cmake\\llvm -DLLVM_VERSION=40 -DWITH_TESTS=OFF -DWITH_APPS=OFF -DWITH_TUTORIALS=OFF -DCMAKE_BUILD_TYPE=Release -G "Visual Studio 14 Win64" .. +MSBuild.exe /m:4 /t:Build /p:Configuration=Release .\\ALL_BUILD.vcxproj +@endcode + +## Build OpenCV with Halide backend +When you build OpenCV, add the following configuration flags: + +- `WITH_HALIDE` - enable Halide linkage + +- `HALIDE_ROOT_DIR` - path to Halide build directory + +You may find out how to build OpenCV with the DNN module in @ref tutorial_dnn_build. + +## Sample + +@include dnn/samples/squeezenet_halide.cpp + +## Explanation +Download the Caffe model from the SqueezeNet repository: [train_val.prototxt](https://github.com/DeepScale/SqueezeNet/blob/master/SqueezeNet_v1.1/train_val.prototxt) and [squeezenet_v1.1.caffemodel](https://github.com/DeepScale/SqueezeNet/blob/master/SqueezeNet_v1.1/squeezenet_v1.1.caffemodel). + +You also need a file with the names of [ILSVRC2012](http://image-net.org/challenges/LSVRC/2012/browse-synsets) classes: +[synset_words.txt](https://raw.githubusercontent.com/ludv1x/opencv_contrib/master/modules/dnn/samples/synset_words.txt). + +Put these files into the working directory of this sample program. + +-# Read and initialize the network using paths to the .prototxt and .caffemodel files +@snippet dnn/samples/squeezenet_halide.cpp Read and initialize network + +-# Check that the network was read successfully +@snippet dnn/samples/squeezenet_halide.cpp Check that network was read successfully + +-# Read the input image and convert it to a 4-dimensional blob acceptable by SqueezeNet v1.1 +@snippet dnn/samples/squeezenet_halide.cpp Prepare blob + +-# Pass the blob to the network +@snippet dnn/samples/squeezenet_halide.cpp Set input blob + +-# Enable the Halide backend for layers where it is implemented +@snippet dnn/samples/squeezenet_halide.cpp Enable Halide backend + +-# Compile Halide functions to execute on CPU +@snippet dnn/samples/squeezenet_halide.cpp Compile Halide pipeline + +-# Make forward pass +@snippet dnn/samples/squeezenet_halide.cpp Make forward pass +Remember that the first forward pass after initialization requires noticeably more +time than the next ones, because Halide pipelines are compiled at runtime +during the first invocation. + +-# Determine the best class +@snippet dnn/samples/squeezenet_halide.cpp Gather output + +-# Print results +@snippet dnn/samples/squeezenet_halide.cpp Print results +For our image we get: + +> Best class: #812 'space shuttle' +> +> Probability: 97.9812% diff --git a/modules/dnn/tutorials/tutorial_dnn_halide_scheduling.markdown b/modules/dnn/tutorials/tutorial_dnn_halide_scheduling.markdown new file mode 100644 index 000000000..d40846e20 --- /dev/null +++ b/modules/dnn/tutorials/tutorial_dnn_halide_scheduling.markdown @@ -0,0 +1,83 @@ +# How to schedule your network for Halide backend {#tutorial_dnn_halide_scheduling} + +## Introduction +Halide code is the same for every device we use. But to achieve satisfactory +efficiency, we should schedule computations properly. In this tutorial we describe +the ways to schedule your networks using the Halide backend in the OpenCV deep learning module. + +For a better understanding of Halide scheduling, you might want to read the tutorials at http://halide-lang.org/tutorials. + +If this is your first encounter with Halide in OpenCV, we recommend starting with @ref tutorial_dnn_halide. 
+ +## Configuration files +When you call ```cv::dnn::Net::compileHalide```, you can pass a path to a text file +that contains scheduling directives for a specific device. + +Scheduling configuration files are represented as YAML files where each node is a +scheduled function or a scheduling directive. +@code +relu1: + reorder: [x, c, y] + split: { y: 2, c: 8 } + parallel: [yo, co] + unroll: yi + vectorize: { x: 4 } +conv1_constant_exterior: + compute_at: { relu1: yi } +@endcode + +By convention, we use the variables `n` for the batch dimension, `c` for channels, +`y` for rows and `x` for columns. Variables produced by a split use names +with the same prefix but `o` and `i` suffixes for the outer and inner variables +correspondingly. For example, for a variable `x` in range `[0, 10)` the directive +`split: { x: 2 }` gives new variables `xo` in range `[0, 5)` and `xi` in range `[0, 2)`. +The variable name `x` is no longer available in the same scheduling node. + +You can find scheduling examples at [opencv_extra/testdata/dnn](https://github.com/opencv/opencv_extra/tree/master/testdata/dnn) +and use them to schedule your networks. + +## Layers fusing +Thanks to layer fusing, we can schedule only the top layers of fused sets, +because every output value is computed by the fused formula. +For example, if you have three consecutive layers Convolution + Scale + ReLU, +@code +conv(x, y, c, n) = sum(...) + bias(c); +scale(x, y, c, n) = conv(x, y, c, n) * weights(c); +relu(x, y, c, n) = max(scale(x, y, c, n), 0); +@endcode + +the fused function is something like +@code +relu(x, y, c, n) = max((sum(...) + bias(c)) * weights(c), 0); +@endcode + +So only the function called `relu` requires scheduling. + +## Scheduling patterns +Sometimes networks are built using a blocked structure, which means some layers are +identical or quite similar. If you want to apply the same scheduling to +different layers up to tiling or vectorization factors, define scheduling +patterns in the `patterns` section at the beginning of the scheduling file. +Also, your patterns may use some parametric variables. +@code +# At the beginning of the file +patterns: + fully_connected: + split: { c: c_split } + fuse: { src: [x, y, co], dst: block } + parallel: block + vectorize: { ci: c_split } +# Somewhere below +fc8: + pattern: fully_connected + params: { c_split: 8 } +@endcode + +## Automatic scheduling +Based on manual scheduling experience, we propose a way to schedule layers +automatically. Just skip the scheduling file path argument of ```cv::dnn::Net::compileHalide``` +to let DNN schedule your network. Sometimes it might be even better +than manual scheduling. + +You can mix manual and automatic scheduling: write a scheduling file +and skip the layers that you want to be scheduled automatically.
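+
+For illustration, here is a minimal sketch of both modes. It assumes that a network
+`net` is already loaded and that `halide_scheduler.yml` is an example name of your
+scheduling file:
+@code
+net.setPreferableBackend(DNN_BACKEND_HALIDE);
+// Manual mode: layers missed in the file are scheduled automatically.
+net.compileHalide("halide_scheduler.yml");
+// Fully automatic mode: no scheduling file at all.
+// net.compileHalide();
+@endcode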