Merge pull request #16161 from YashasSamaga:cuda4dnn-concat-fusion

cuda4dnn(concat): write outputs from previous layers directly into concat's output

* eliminate concat by directly writing to its output buffer (sketched below)

* fix concat fusion not happening sometimes

* use a whitelist instead of a blacklist
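
In outline: when a Concat joins its inputs along the channel axis and each input is produced by exactly one layer, every producer can be handed a window into the Concat output buffer at the right channel offset, write its result there directly, and the Concat layer itself can be skipped. Below is a minimal sketch of that offset arithmetic for NCHW tensors with batch size 1; the TensorWindow struct and makeWindows helper are invented for illustration and are not part of cuda4dnn.

    #include <cstddef>
    #include <vector>

    // A "window" into the shared concat output: a pointer offset into the
    // buffer plus the shape of the slice this producer is allowed to write.
    struct TensorWindow {
        float* data;   // points into the concat output buffer
        int c, h, w;   // slice shape (batch of 1 assumed)
    };

    // Compute the window for each input of the concat. Each producer then
    // writes directly into its window and the concat has nothing left to do.
    std::vector<TensorWindow> makeWindows(float* concat_output, int h, int w,
                                          const std::vector<int>& channels_per_input)
    {
        std::vector<TensorWindow> windows;
        std::size_t offset = 0;  // in elements, not bytes
        for (int c : channels_per_input) {
            windows.push_back({concat_output + offset, c, h, w});
            offset += static_cast<std::size_t>(c) * h * w;  // skip past this slice
        }
        return windows;
    }

This is the same arithmetic the dnn.cpp change below performs when it calls cuda_wrapper->update(shape, offset) with offset = chrange[1].start * (output.size[2] * output.size[3]).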
Authored by Yashas Samaga B L, committed via GitHub 5 years ago
parent 77dd40c96c
commit 1f695c4532
Changed files:
  1. modules/dnn/src/dnn.cpp (48 changed lines)
  2. modules/dnn/src/op_cuda.hpp (24 changed lines)

@@ -2470,7 +2470,9 @@ struct Net::Impl
                      ld.layerInstance->type != "Concat")) )
                 continue;
-            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget) && ld.layerInstance->type != "Convolution")
+            if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget)
+                && ld.layerInstance->type != "Convolution"
+                && ld.layerInstance->type != "Concat")
                 continue;
             while (nextData)
@@ -2626,7 +2628,7 @@ struct Net::Impl
                 }
             }
-            if (preferableBackend != DNN_BACKEND_OPENCV)
+            if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
                 continue; // Go to the next layer.
             // the optimization #2. if there is concat layer that concatenates channels
@@ -2694,6 +2696,15 @@ struct Net::Impl
                 if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                     break;
+#ifdef HAVE_CUDA
+                if (preferableBackend == DNN_BACKEND_CUDA &&
+                    (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
+                     (inp_i_data->layerInstance->type != "Convolution" &&
+                      inp_i_data->layerInstance->type != "Pooling")))
+                {
+                    break;
+                }
+#endif
                 realinputs[i] = pin;
             }
@@ -2711,6 +2722,10 @@ struct Net::Impl
                     umats[0] = umat_output;
                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
                 }
 #endif
+#ifdef HAVE_CUDA
+                if (preferableBackend == DNN_BACKEND_CUDA)
+                    ld.outputBlobsWrappers[0] = wrap(output);
+#endif
                 Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
                 int ofs = 0;
@@ -2735,11 +2750,40 @@ struct Net::Impl
                         umats[pin.oid] = umat_output(chrange);
                         OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
                     }
 #endif
+#ifdef HAVE_CUDA
+                    if (preferableBackend == DNN_BACKEND_CUDA)
+                    {
+                        auto cuda_wrapper = wrap(output).dynamicCast<CUDABackendWrapper>();
+                        auto offset = chrange[1].start * (output.size[2] * output.size[3]);
+                        auto shape = MatShape{1, chrange[1].size(), output.size[2], output.size[3]};
+                        cuda_wrapper->update(shape, offset);
+                        inp_i_data->outputBlobsWrappers[pin.oid] = cuda_wrapper.staticCast<BackendWrapper>();
+                    }
+#endif
                     // Layers that refer old input Mat will refer to the
                     // new data but the same Mat object.
                     CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
                 }
+#ifdef HAVE_CUDA
+                if (preferableBackend == DNN_BACKEND_CUDA)
+                {
+                    for (int i = 0; i < ld.consumers.size(); i++)
+                    {
+                        LayerData& consumer = layers[ld.consumers[i].lid];
+                        for (int j = 0; j < consumer.inputBlobsId.size(); j++)
+                        {
+                            if (consumer.inputBlobsId[j].lid == ld.id)
+                            {
+                                CV_Assert(consumer.inputBlobs[j]->data == ld.outputBlobs[0].data);
+                                consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+                                break;
+                            }
+                        }
+                    }
+                }
+#endif
                 ld.skip = true;
                 printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
             }

@@ -217,6 +217,8 @@ namespace cv { namespace dnn {
         /** @note setting the stream updates the stream for all wrappers which use the same tensor */
         virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+        virtual void update(const MatShape& shape, std::size_t offset) = 0;
     };
     namespace cuda4dnn { namespace detail {
@@ -276,6 +278,7 @@ namespace cv { namespace dnn {
             : CUDABackendWrapper(TargetID)
         {
             shape = cv::dnn::shape(m);
+            offset = 0;
             shared_block = std::make_shared<shared_block_type>();
             shared_block->host_dirty = true;
@@ -300,6 +303,7 @@ namespace cv { namespace dnn {
             CV_Assert(base);
             shape = shape_;
+            offset = 0;
             shared_block = base->shared_block;
         }
@@ -313,6 +317,8 @@ namespace cv { namespace dnn {
         void copyToHost() override {
             if (shared_block->device_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
@@ -339,6 +345,8 @@ namespace cv { namespace dnn {
         void copyToDevice() override {
             if (shared_block->host_dirty) {
+                CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
                 shared_block->host_dirty = false;
                 shared_block->device_dirty = false;
@@ -365,13 +373,24 @@ namespace cv { namespace dnn {
             shared_block->stream = std::move(stream);
         }
+        void update(const MatShape& shape_, std::size_t offset_) override {
+            auto total = std::accumulate(std::begin(shape_), std::end(shape_), 1, std::multiplies<MatShape::value_type>());
+            if (offset_ + total > shared_block->device.size()) {
+                CV_Error(Error::BadOffset, "shape and offset provided can potentially lead to OOB access");
+            }
+            shape = shape_;
+            offset = offset_;
+        }
         cv::Mat getMutableHostMat() noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             setHostDirty();
             return shared_block->host;
         }
         const cv::Mat getImmutableHostMat() const noexcept {
+            CV_Assert(offset == 0); /* we cannot track each piece of the memory separately */
             copyToHost();
             return shared_block->host;
         }
@@ -388,12 +407,12 @@
          */
         tensor_span_type getSpan() noexcept {
             setDeviceDirty();
-            return tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_span_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
         tensor_view_type getView() noexcept {
             copyToDevice();
-            return tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+            return tensor_view_type(shared_block->device.get() + offset, std::begin(shape), std::end(shape));
         }
     private:
@@ -407,6 +426,7 @@
          */
         MatShape shape;
+        std::size_t offset;
         struct shared_block_type {
             bool host_dirty;
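
To make the new wrapper semantics concrete, here is a small host-only sketch; FakeWrapper is invented for illustration (it is not the real GenericCUDABackendWrapper and returns a raw pointer instead of a tensor span). After update(shape, offset), spans and views start offset elements into the shared buffer, which is what lets several wrappers alias disjoint channel slices of one concat output.

    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Host-only stand-in for the wrapper: update() re-points the "span" at a
    // sub-region of a shared buffer, mirroring shared_block->device.get() + offset.
    class FakeWrapper {
    public:
        explicit FakeWrapper(std::size_t capacity) : buffer_(capacity), offset_(0) {}

        void update(const std::vector<int>& shape, std::size_t offset) {
            // Reject shape/offset pairs that would run past the allocation,
            // analogous to the CV_Error(Error::BadOffset, ...) check above.
            std::size_t total = std::accumulate(shape.begin(), shape.end(),
                                                std::size_t(1), std::multiplies<std::size_t>());
            assert(offset + total <= buffer_.size() && "shape and offset would go out of bounds");
            shape_ = shape;
            offset_ = offset;
        }

        float* getSpan() { return buffer_.data() + offset_; }  // starts offset elements in

    private:
        std::vector<float> buffer_;
        std::vector<int> shape_;
        std::size_t offset_;
    };

    int main() {
        FakeWrapper wrapper(1 * 8 * 4 * 4);         // concat output: 1x8x4x4 floats
        float* base = wrapper.getSpan();            // offset is 0 initially
        wrapper.update({1, 5, 4, 4}, 3 * 4 * 4);    // second input owns channels [3, 8)
        assert(wrapper.getSpan() == base + 48);     // its window starts 48 elements in
        return 0;
    }

Note that in the real wrapper the copyToHost/copyToDevice paths assert offset == 0: dirty tracking works on the whole shared block, not on individual slices.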
