Open Source Computer Vision Library
https://opencv.org/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
429 lines
13 KiB
429 lines
13 KiB
// This file is part of OpenCV project. |
|
// It is subject to the license terms in the LICENSE file found in the top-level directory |
|
// of this distribution and at http://opencv.org/license.html. |
|
// |
|
// Copyright (C) 2017, Intel Corporation, all rights reserved. |
|
// Third party copyrights are property of their respective owners. |
|
|
|
#include "precomp.hpp" |
|
#include <opencv2/dnn/shape_utils.hpp> |
|
#include "op_halide.hpp" |
|
#include "net_impl.hpp" |
|
|
|
#ifdef HAVE_HALIDE |
|
#include "halide_scheduler.hpp" |
|
|
|
#include <HalideRuntimeOpenCL.h> |
|
#include <thread> |
|
#endif // HAVE_HALIDE |
|
|
|
namespace cv { |
|
namespace dnn { |
|
CV__DNN_INLINE_NS_BEGIN |
|
|
|
|
|
void Net::Impl::setHalideScheduler(const String& scheduler) |
|
{ |
|
halideConfigFile = scheduler; |
|
} |
|
|
|
|
|
#ifdef HAVE_HALIDE |
|
|
|
|
|
void Net::Impl::compileHalide() |
|
{ |
|
CV_TRACE_FUNCTION(); |
|
|
|
CV_Assert(preferableBackend == DNN_BACKEND_HALIDE); |
|
|
|
HalideScheduler scheduler(halideConfigFile); |
|
std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64); |
|
for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it) |
|
{ |
|
LayerData& ld = it->second; |
|
Ptr<Layer> layer = ld.layerInstance; |
|
if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip) |
|
{ |
|
CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty()); |
|
bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]); |
|
if (!scheduled) |
|
{ |
|
// Use automatic scheduling provided by layer. |
|
layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE], |
|
ld.inputBlobs, ld.outputBlobs, |
|
preferableTarget); |
|
} |
|
compileList.emplace_back(ld); |
|
} |
|
} |
|
std::atomic<int> progress(0); |
|
auto fn = ([&] () -> void |
|
{ |
|
for (;;) |
|
{ |
|
int id = progress.fetch_add(1); |
|
if ((size_t)id >= compileList.size()) |
|
return; |
|
const LayerData& ld = compileList[id].get(); |
|
Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second; |
|
dnn::compileHalide(ld.outputBlobs, node, preferableTarget); |
|
} |
|
}); |
|
size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency()); |
|
num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads)); |
|
std::vector<std::thread> threads(num_threads - 1); |
|
for (auto& t: threads) t = std::thread(fn); |
|
fn(); // process own tasks |
|
for (auto& t: threads) t.join(); |
|
} |
|
|
|
|
|
void Net::Impl::initHalideBackend() |
|
{ |
|
CV_TRACE_FUNCTION(); |
|
CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide()); |
|
|
|
// Iterator to current layer. |
|
MapIdToLayerData::iterator it = layers.begin(); |
|
// Iterator to base layer for fusion. In example, in case of conv+bn+relu |
|
// it'll be a conv layer. |
|
MapIdToLayerData::iterator baseIt = layers.begin(); |
|
for (; it != layers.end(); it++) |
|
{ |
|
LayerData &ldTop = it->second; |
|
Ptr<Layer> layerTop = ldTop.layerInstance; |
|
if (!layerTop->supportBackend(preferableBackend)) |
|
{ |
|
// Move base iterator to layer that don't support preferable |
|
// backend to prevent fusion over layer of different backend. |
|
baseIt = it; |
|
continue; |
|
} |
|
// Try to do layers fusion. |
|
LayerData &ldBot = baseIt->second; |
|
Ptr<Layer> layerBot = ldBot.layerInstance; |
|
// 1. Check that bottom and top from the same backends. |
|
if (it != layers.begin() && layerBot->supportBackend(preferableBackend)) |
|
{ |
|
// 2. Check that current layer works in-place. |
|
bool inPlace = ldTop.inputBlobs.size() == 1 && |
|
ldBot.outputBlobs.size() == 1 && |
|
ldTop.inputBlobs[0]->data == |
|
ldBot.outputBlobs[0].data; |
|
if (inPlace) |
|
{ |
|
// 3. Try to attach node. |
|
CV_Assert(!ldBot.backendNodes[preferableBackend].empty()); |
|
Ptr<BackendNode> fusedNode = |
|
layerTop->tryAttach(ldBot.backendNodes[preferableBackend]); |
|
if (!fusedNode.empty()) |
|
{ |
|
ldTop.skip = true; |
|
ldBot.backendNodes[preferableBackend] = fusedNode; |
|
ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers; |
|
continue; |
|
} |
|
} |
|
} |
|
// No layers fusion. |
|
ldTop.skip = false; |
|
ldTop.backendNodes[DNN_BACKEND_HALIDE] = |
|
layerTop->initHalide(ldTop.inputBlobsWrappers); |
|
baseIt = it; |
|
} |
|
} |
|
|
|
|
|
#endif // HAVE_HALIDE |
|
CV__DNN_INLINE_NS_END |
|
|
|
|
|
#ifdef HAVE_HALIDE |
|
static MatShape getBufferShape(const MatShape& shape) |
|
{ |
|
if (shape.size() == 2 || shape.size() == 4) |
|
{ |
|
int w, h, c, n; |
|
getCanonicalSize(shape, &w, &h, &c, &n); |
|
return {w, h, c, n}; |
|
} |
|
else |
|
{ |
|
MatShape bufferShape(shape); |
|
std::reverse(bufferShape.begin(), bufferShape.end()); |
|
return bufferShape; |
|
} |
|
} |
|
|
|
static MatShape getBufferShape(const MatSize& size) |
|
{ |
|
return getBufferShape(shape(size)); |
|
} |
|
|
|
Halide::Buffer<float> wrapToHalideBuffer(const Mat& mat) |
|
{ |
|
return wrapToHalideBuffer(mat, getBufferShape(mat.size)); |
|
} |
|
|
|
Halide::Buffer<float> wrapToHalideBuffer(const Mat& mat, |
|
const std::vector<int>& sizes) |
|
{ |
|
Halide::Buffer<float> buffer((float*)mat.data, sizes); |
|
buffer.set_host_dirty(); // Indicate that data is on CPU. |
|
return buffer; |
|
} |
|
|
|
Halide::Buffer<> halideBuffer(const Ptr<BackendWrapper>& ptr) |
|
{ |
|
CV_Assert(!ptr.empty()); |
|
return ptr.dynamicCast<HalideBackendWrapper>()->buffer; |
|
} |
|
|
|
std::vector<Halide::Buffer<> > halideBuffers(const std::vector<Ptr<BackendWrapper> >& ptrs) |
|
{ |
|
std::vector<Halide::Buffer<> > vec; |
|
vec.reserve(ptrs.size()); |
|
for (const Ptr<BackendWrapper>& ptr : ptrs) |
|
{ |
|
vec.push_back(halideBuffer(ptr)); |
|
} |
|
return vec; |
|
} |
|
|
|
void getCanonicalSize(const Halide::Buffer<>& buffer, int* width, int* height, |
|
int* channels, int* batch) |
|
{ |
|
CV_Assert(buffer.dimensions() == 4); |
|
*width = buffer.extent(0); |
|
*height = buffer.extent(1); |
|
*channels = buffer.extent(2); |
|
*batch = buffer.extent(3); |
|
} |
|
|
|
HalideBackendNode::HalideBackendNode(const Halide::Func& func) |
|
: BackendNode(DNN_BACKEND_HALIDE), funcs(1, func) {} |
|
|
|
HalideBackendNode::HalideBackendNode(const std::vector<Halide::Func>& funcs) |
|
: BackendNode(DNN_BACKEND_HALIDE), funcs(funcs) {} |
|
|
|
HalideBackendNode::HalideBackendNode(const Ptr<HalideBackendNode>& base, |
|
const Halide::Func& top) |
|
: BackendNode(DNN_BACKEND_HALIDE), funcs(base->funcs) |
|
{ |
|
funcs.back() = top; |
|
} |
|
|
|
HalideBackendWrapper::HalideBackendWrapper(int targetId, const cv::Mat& m) |
|
: BackendWrapper(DNN_BACKEND_HALIDE, targetId) |
|
{ |
|
managesDevMemory = true; |
|
buffer = wrapToHalideBuffer(m); |
|
if (targetId == DNN_TARGET_CPU) |
|
{ |
|
return; |
|
} |
|
else if (targetId == DNN_TARGET_OPENCL) |
|
{ |
|
Halide::Target t = Halide::get_host_target(); |
|
t.set_feature(Halide::Target::OpenCL); |
|
buffer.copy_to_device(t); |
|
} |
|
else |
|
CV_Error(Error::StsNotImplemented, "Unknown target identifier"); |
|
} |
|
|
|
HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base, |
|
const MatShape& shape) |
|
: BackendWrapper(DNN_BACKEND_HALIDE, base->targetId) |
|
{ |
|
managesDevMemory = false; |
|
Halide::Buffer<float> baseBuffer = halideBuffer(base); |
|
buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host, |
|
getBufferShape(shape)); |
|
if (baseBuffer.has_device_allocation()) |
|
{ |
|
buffer.raw_buffer()->device = baseBuffer.raw_buffer()->device; |
|
buffer.raw_buffer()->device_interface = baseBuffer.raw_buffer()->device_interface; |
|
buffer.set_device_dirty(); |
|
} |
|
else |
|
{ |
|
buffer.set_host_dirty(); // Indicate that data is on CPU. |
|
CV_Assert(targetId == DNN_TARGET_CPU); |
|
} |
|
} |
|
|
|
HalideBackendWrapper::~HalideBackendWrapper() |
|
{ |
|
if (buffer.has_device_allocation() && !managesDevMemory) |
|
{ |
|
buffer.raw_buffer()->device = 0; |
|
buffer.raw_buffer()->device_interface = 0; |
|
buffer.set_device_dirty(false); |
|
} |
|
} |
|
|
|
void HalideBackendWrapper::copyToHost() |
|
{ |
|
if (buffer.device_dirty()) |
|
{ |
|
buffer.device_sync(); |
|
buffer.copy_to_host(); |
|
} |
|
} |
|
|
|
void HalideBackendWrapper::setHostDirty() |
|
{ |
|
buffer.set_device_dirty(false); |
|
buffer.set_host_dirty(); |
|
} |
|
#endif // HAVE_HALIDE |
|
|
|
void getCanonicalSize(const MatSize& size, int* w, int* h, int* c, int* n) |
|
{ |
|
getCanonicalSize(shape(size), w, h, c, n); |
|
} |
|
|
|
void getCanonicalSize(const MatShape& shape, int* width, int* height, |
|
int* channels, int* batch) |
|
{ |
|
const int dims = shape.size(); |
|
CV_Assert(dims == 2 || dims == 4); |
|
*batch = shape[0]; |
|
*channels = shape[1]; |
|
if (dims == 4) |
|
{ |
|
*width = shape[3]; |
|
*height = shape[2]; |
|
} |
|
else |
|
{ |
|
*width = 1; |
|
*height = 1; |
|
} |
|
} |
|
|
|
void compileHalide(const std::vector<Mat> &outputs, Ptr<BackendNode>& node, int targetId) |
|
{ |
|
#ifdef HAVE_HALIDE |
|
CV_Assert(!node.empty()); |
|
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back(); |
|
|
|
int outW, outH, outC, outN; |
|
Halide::Var x("x"), y("y"), c("c"), n("n"); |
|
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); |
|
top.bound(x, 0, outW).bound(y, 0, outH) |
|
.bound(c, 0, outC).bound(n, 0, outN); |
|
|
|
Halide::Target target = Halide::get_host_target(); |
|
target.set_feature(Halide::Target::NoAsserts); |
|
if (targetId == DNN_TARGET_OPENCL) |
|
{ |
|
target.set_feature(Halide::Target::OpenCL); |
|
} |
|
CV_Assert(target.supported()); |
|
top.compile_jit(target); |
|
#endif // HAVE_HALIDE |
|
} |
|
|
|
void forwardHalide(std::vector<Ptr<BackendWrapper> > &outputs, |
|
const Ptr<BackendNode>& node) |
|
{ |
|
#ifdef HAVE_HALIDE |
|
CV_Assert(!node.empty()); |
|
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back(); |
|
auto outputBuffers = halideBuffers(outputs); |
|
top.realize(Halide::Realization(outputBuffers)); |
|
#endif // HAVE_HALIDE |
|
} |
|
|
|
bool haveHalide() |
|
{ |
|
#ifdef HAVE_HALIDE |
|
return true; |
|
#else |
|
return false; |
|
#endif // HAVE_HALIDE |
|
} |
|
|
|
|
|
CV__DNN_INLINE_NS_BEGIN |
|
|
|
|
|
void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs, |
|
const std::vector<Mat> &outputs, int targetId) const |
|
{ |
|
#ifndef HAVE_HALIDE |
|
CV_Error(Error::StsNotImplemented, ""); |
|
#else |
|
CV_TRACE_FUNCTION(); |
|
|
|
Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"), |
|
xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile"); |
|
Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back(); |
|
|
|
int outW, outH, outC, outN; |
|
getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN); |
|
|
|
if (targetId == DNN_TARGET_CPU) |
|
{ |
|
if (outW == 1 && outH == 1) |
|
{ |
|
if (outC + outN == 1) |
|
return; |
|
|
|
if (outC > 8) |
|
top.split(c, co, ci, 8) |
|
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile) |
|
.parallel(tile) |
|
.vectorize(ci, 8); |
|
else |
|
top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile) |
|
.parallel(tile); |
|
} |
|
else |
|
{ |
|
if (outH > 2) |
|
{ |
|
top.reorder(x, c, y) |
|
.split(y, yo, yi, 2) |
|
.fuse(yo, n, tile) |
|
.parallel(tile) |
|
.unroll(yi) |
|
.vectorize(x, outW >= 16 ? 16 : outW); |
|
} |
|
} |
|
} |
|
else if (targetId == DNN_TARGET_OPENCL) |
|
{ |
|
if (outW == 1 && outH == 1) |
|
{ |
|
int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC; |
|
top.split(c, co, ci, c_split) |
|
.fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile) |
|
.gpu_blocks(tile) |
|
.gpu_threads(ci); |
|
} |
|
else |
|
{ |
|
int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW; |
|
int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH; |
|
// Supported vectorization widths: 2, 3, 4, 8, 16 |
|
int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC); |
|
top.split(x, xo, xi, x_split).split(y, yo, yi, y_split) |
|
.split(c, co, ci, c_split) |
|
.gpu_blocks(xo, yo, co) |
|
.gpu_threads(xi, yi) |
|
.reorder(xi, yi, ci, xo, yo, co) |
|
.vectorize(ci); |
|
} |
|
} |
|
else |
|
CV_Error(Error::StsNotImplemented, "Unknown target identifier"); |
|
#endif // HAVE_HALIDE |
|
} |
|
|
|
|
|
CV__DNN_INLINE_NS_END |
|
}} // namespace
|
|
|