From 4b52b8df348ba67d1e34834b624cef6bd5ef040f Mon Sep 17 00:00:00 2001
From: Dmitry Kurtaev
Date: Thu, 12 Oct 2017 18:29:17 +0300
Subject: [PATCH] Layers for fast-neural-style models:
 https://github.com/jcjohnson/fast-neural-style
---
 .../dnn/include/opencv2/dnn/all_layers.hpp  |  1 +
 .../dnn/include/opencv2/dnn/shape_utils.hpp | 14 +++--
 modules/dnn/src/layers/padding_layer.cpp    | 44 +++++++++++++-
 modules/dnn/src/layers/slice_layer.cpp      | 32 ++++++----
 modules/dnn/src/torch/torch_importer.cpp    | 58 ++++++++++++++++---
 modules/dnn/test/test_torch_importer.cpp    | 44 ++++++++++++++
 samples/dnn/fast_neural_style.py            | 51 ++++++++++++++++
 7 files changed, 218 insertions(+), 26 deletions(-)
 create mode 100644 samples/dnn/fast_neural_style.py

diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index b65f07555d..539855a8a3 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -377,6 +377,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
      *               starting from the first one. The rest of dimensions won't
      *               be padded.
      * @param value Value to be padded. Defaults to zero.
+     * @param type Padding type: 'constant' or 'reflect'. Defaults to 'constant'.
      * @param input_dims Torch's parameter. If @p input_dims is not equal to the
      *                   actual input dimensionality then the `[0]th` dimension
      *                   is considered as a batch dimension and @p paddings are shifted
diff --git a/modules/dnn/include/opencv2/dnn/shape_utils.hpp b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
index e74b53cce8..4a77473d96 100644
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@@ -112,16 +112,12 @@ static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const
 static inline Mat getPlane(const Mat &m, int n, int cn)
 {
     CV_Assert(m.dims > 2);
-    Range range[CV_MAX_DIM];
     int sz[CV_MAX_DIM];
     for(int i = 2; i < m.dims; i++)
     {
         sz[i-2] = m.size.p[i];
-        range[i] = Range::all();
     }
-    range[0] = Range(n, n+1);
-    range[1] = Range(cn, cn+1);
-    return m(range).reshape(1, m.dims-2, sz);
+    return Mat(m.dims - 2, sz, m.type(), (void*)m.ptr(n, cn));
 }
 
 static inline MatShape shape(const int* dims, const int n = 4)
@@ -191,6 +187,14 @@ inline int clamp(int ax, const MatShape& shape)
     return clamp(ax, (int)shape.size());
 }
 
+inline Range clamp(const Range& r, int axisSize)
+{
+    Range clamped(std::max(r.start, 0),
+                  r.end > 0 ? std::min(r.end, axisSize) : axisSize + r.end + 1);
+    CV_Assert(clamped.start < clamped.end, clamped.end <= axisSize);
+    return clamped;
+}
+
 CV__DNN_EXPERIMENTAL_NS_END
 }
 }
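Note: the new clamp(Range, axisSize) overload resolves negative end indices
Torch-style, so end == -1 expands to the full axis and end == -N stops N - 1
elements before it. A minimal Python sketch of the same rule (the clamp_range
helper is hypothetical, for illustration only):

    def clamp_range(start, end, axis_size):
        # Mirrors clamp(Range, axisSize) above: a negative end is resolved
        # against the axis size, so end == -1 expands to axis_size.
        clamped_start = max(start, 0)
        clamped_end = min(end, axis_size) if end > 0 else axis_size + end + 1
        assert clamped_start < clamped_end <= axis_size
        return clamped_start, clamped_end

    assert clamp_range(0, -1, 10) == (0, 10)   # whole axis
    assert clamp_range(2, -3, 10) == (2, 8)    # drop the two trailing elements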
diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp
index 393c8474ed..ae62ee4560 100644
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@@ -10,6 +10,7 @@
 Implementation of padding layer, which adds paddings to input blob.
 */
 #include "../precomp.hpp"
+#include "layers_common.hpp"
 #include "op_halide.hpp"
 #include <vector>
@@ -26,6 +27,7 @@ public:
         setParamsFrom(params);
         paddingValue = params.get<float>("value", 0);
         inputDims = params.get<int>("input_dims", -1);
+        paddingType = params.get<String>("type", "constant");
 
         CV_Assert(params.has("paddings"));
         const DictValue& paddingsParam = params.get("paddings");
@@ -94,8 +96,45 @@ public:
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        outputs[0].setTo(paddingValue);
-        inputs[0]->copyTo(outputs[0](dstRanges));
+        if (paddingType == "constant")
+        {
+            outputs[0].setTo(paddingValue);
+            inputs[0]->copyTo(outputs[0](dstRanges));
+        }
+        else if (paddingType == "reflect")
+        {
+            CV_Assert(inputs.size() == 1);
+            CV_Assert(outputs.size() == 1);
+            CV_Assert(inputs[0]->dims == 4);
+            CV_Assert(outputs[0].dims == 4);
+
+            if (inputs[0]->size[0] != outputs[0].size[0] || inputs[0]->size[1] != outputs[0].size[1])
+                CV_Error(Error::StsNotImplemented, "Only spatial reflection padding is supported.");
+
+            const int inpHeight = inputs[0]->size[2];
+            const int inpWidth = inputs[0]->size[3];
+            const int outHeight = outputs[0].size[2];
+            const int outWidth = outputs[0].size[3];
+            const int padTop = dstRanges[2].start;
+            const int padBottom = outHeight - dstRanges[2].end;
+            const int padLeft = dstRanges[3].start;
+            const int padRight = outWidth - dstRanges[3].end;
+            CV_Assert(padTop < inpHeight, padBottom < inpHeight,
+                      padLeft < inpWidth, padRight < inpWidth);
+
+            for (size_t n = 0; n < inputs[0]->size[0]; ++n)
+            {
+                for (size_t ch = 0; ch < inputs[0]->size[1]; ++ch)
+                {
+                    copyMakeBorder(getPlane(*inputs[0], n, ch),
+                                   getPlane(outputs[0], n, ch),
+                                   padTop, padBottom, padLeft, padRight,
+                                   BORDER_REFLECT_101);
+                }
+            }
+        }
+        else
+            CV_Error(Error::StsNotImplemented, "Unknown padding type: " + paddingType);
     }
 
     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
@@ -124,6 +163,7 @@ private:
     std::vector<Range> dstRanges;
     int inputDims;
     float paddingValue;
+    std::string paddingType;
 };
 
 Ptr<PaddingLayer> PaddingLayer::create(const LayerParams &params)
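For intuition, the reflect branch maps each (sample, channel) plane to
cv::copyMakeBorder with BORDER_REFLECT_101, which mirrors around the edge pixel
without repeating it; that is also why each pad must stay strictly smaller than
the matching input dimension. A small NumPy/OpenCV sketch of the same per-plane
loop (illustrative only, not the layer's code):

    import cv2 as cv
    import numpy as np

    blob = np.random.rand(1, 2, 5, 5).astype(np.float32)   # NCHW blob
    padTop, padBottom, padLeft, padRight = 2, 2, 2, 2      # each must be < 5

    out = np.stack([np.stack([cv.copyMakeBorder(blob[n, c],
                                                padTop, padBottom, padLeft, padRight,
                                                cv.BORDER_REFLECT_101)
                              for c in range(blob.shape[1])])
                    for n in range(blob.shape[0])])

    # With BORDER_REFLECT_101, padded row 0 is a mirror of input row 2.
    assert np.allclose(out[0, 0, 0, padLeft:-padRight], blob[0, 0, 2])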
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index c7db0f46bf..07a670bf31 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -58,7 +58,7 @@ public:
         axis = params.get<int>("axis", 1);
         if (params.has("slice_point"))
         {
-            CV_Assert(!params.has("begin") && !params.has("size"));
+            CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
             const DictValue &indicesValue = params.get("slice_point");
             sliceRanges.resize(indicesValue.size() + 1,
                                std::vector<Range>(axis + 1, Range::all()));
@@ -71,24 +71,34 @@ public:
             }
             sliceRanges.back()[axis].start = prevSlice;
         }
-        else if (params.has("begin") && params.has("size"))
+        else if (params.has("begin"))
         {
+            CV_Assert(params.has("size") ^ params.has("end"));
             const DictValue &begins = params.get("begin");
-            const DictValue &sizes = params.get("size");
-            CV_Assert(begins.size() == sizes.size());
+            const DictValue &sizesOrEnds = params.has("size") ? params.get("size") : params.get("end");
+            CV_Assert(begins.size() == sizesOrEnds.size());
             sliceRanges.resize(1);
             sliceRanges[0].resize(begins.size(), Range::all());
             for (int i = 0; i < begins.size(); ++i)
             {
                 int start = begins.get<int>(i);
-                int size = sizes.get<int>(i);
+                int sizeOrEnd = sizesOrEnds.get<int>(i);  // It may be negative to index from the end.
                 CV_Assert(start >= 0);
-                CV_Assert(size == -1 || size > 0);  // -1 value means range [start, axis_size).
 
                 sliceRanges[0][i].start = start;
-                if (size > 0)
-                    sliceRanges[0][i].end = start + size;
+                if (params.has("size"))
+                {
+                    int size = sizeOrEnd;
+                    CV_Assert(size == -1 || size > 0);  // -1 value means range [start, axis_size).
+                    sliceRanges[0][i].end = size > 0 ? start + size : -1;  // We'll finalize a negative value later.
+                }
+                else
+                {
+                    int end = sizeOrEnd;
+                    CV_Assert(end < 0 || end > start);  // End index is excluded.
+                    sliceRanges[0][i].end = end;  // We'll finalize a negative value later.
+                }
             }
         }
     }
@@ -109,8 +119,7 @@ public:
             CV_Assert(sliceRanges[i].size() <= inpShape.size());
             for (int j = 0; j < sliceRanges[i].size(); ++j)
             {
-                outputs[i][j] = std::min(sliceRanges[i][j].end, inpShape[j]) -
-                                std::max(sliceRanges[i][j].start, 0);
+                outputs[i][j] = clamp(sliceRanges[i][j], inpShape[j]).size();
             }
         }
     }
@@ -152,8 +161,7 @@ public:
             // Clamp.
             for (int j = 0; j < sliceRanges[i].size(); ++j)
             {
-                sliceRanges[i][j].start = std::max(0, sliceRanges[i][j].start);
-                sliceRanges[i][j].end = std::min(sliceRanges[i][j].end, inpShape[j]);
+                sliceRanges[i][j] = clamp(sliceRanges[i][j], inpShape[j]);
             }
             // Fill the rest of ranges.
             for (int j = sliceRanges[i].size(); j < inpShape.size(); ++j)
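The new 'end' parameter makes Slice ranges behave like Python slices whose
negative ends are resolved against the axis size by clamp; the importer's
ShaveImage mapping below relies on exactly this. A NumPy sketch of the
semantics (the resolve_end helper is hypothetical):

    import numpy as np

    def resolve_end(end, axis_size):
        # Same rule as clamp(Range, axisSize): end == -1 keeps the whole
        # axis, end == -3 stops two elements before it.
        return min(end, axis_size) if end > 0 else axis_size + end + 1

    blob = np.zeros((1, 3, 8, 8), np.float32)
    begins = [0, 0, 2, 2]                # what ShaveImage(size=2) produces
    ends = [-1, -1, -3, -3]              # i.e. -size - 1 on the spatial axes

    resolved = [resolve_end(e, s) for e, s in zip(ends, blob.shape)]
    out = blob[tuple(slice(b, e) for b, e in zip(begins, resolved))]
    assert out.shape == (1, 3, 4, 4)     # two pixels shaved from every border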
(nnName == "Identity") + // TotalVariation layer is from fast-neural-style project: https://github.com/jcjohnson/fast-neural-style + // It's a loss function that has an Identity forward. + else if (nnName == "Identity" || nnName == "TotalVariation") { readTorchTable(scalarParams, tensorParams); newModule->apiType = "Identity"; @@ -866,7 +890,7 @@ struct TorchImporter : public ::cv::dnn::Importer layerParams.set("scale", scalarParams.get("constant_scalar")); curModule->modules.push_back(newModule); } - else if (nnName == "SpatialZeroPadding") + else if (nnName == "SpatialZeroPadding" || nnName == "SpatialReflectionPadding") { readTorchTable(scalarParams, tensorParams); CV_Assert(scalarParams.has("pad_l"), scalarParams.has("pad_r"), @@ -889,6 +913,26 @@ struct TorchImporter : public ::cv::dnn::Importer paddings[5] = padRight; layerParams.set("paddings", DictValue::arrayInt(&paddings[0], paddings.size())); layerParams.set("input_dims", 3); + + if (nnName == "SpatialReflectionPadding") + layerParams.set("type", "reflect"); + + curModule->modules.push_back(newModule); + } + else if (nnName == "ShaveImage") + { + // ShaveImage layer is from fast-neural-style project: https://github.com/jcjohnson/fast-neural-style + // It may be mapped to Slice layer. + readTorchTable(scalarParams, tensorParams); + CV_Assert(scalarParams.has("size")); + int size = scalarParams.get("size"); + + int begins[] = {0, 0, size, size}; + int ends[] = {-1, -1, -size - 1, -size - 1}; + + newModule->apiType = "Slice"; + layerParams.set("begin", DictValue::arrayInt(&begins[0], 4)); + layerParams.set("end", DictValue::arrayInt(&ends[0], 4)); curModule->modules.push_back(newModule); } else diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index 8c894ae7d9..5015d5dd0f 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -231,6 +231,7 @@ TEST(Torch_Importer, net_padding) { runTorchNet("net_padding", DNN_TARGET_CPU, "", false, true); runTorchNet("net_spatial_zero_padding", DNN_TARGET_CPU, "", false, true); + runTorchNet("net_spatial_reflection_padding", DNN_TARGET_CPU, "", false, true); } TEST(Torch_Importer, ENet_accuracy) @@ -338,6 +339,49 @@ OCL_TEST(Torch_Importer, ENet_accuracy) } } +// Check accuracy of style transfer models from https://github.com/jcjohnson/fast-neural-style +// th fast_neural_style.lua \ +// -input_image ~/opencv_extra/testdata/dnn/googlenet_1.png \ +// -output_image lena.png \ +// -median_filter 0 \ +// -image_size 0 \ +// -model models/eccv16/starry_night.t7 +// th fast_neural_style.lua \ +// -input_image ~/opencv_extra/testdata/dnn/googlenet_1.png \ +// -output_image lena.png \ +// -median_filter 0 \ +// -image_size 0 \ +// -model models/instance_norm/feathers.t7 +TEST(Torch_Importer, FastNeuralStyle_accuracy) +{ + std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7", + "dnn/fast_neural_style_instance_norm_feathers.t7"}; + std::string targets[] = {"dnn/lena_starry_night.png", "dnn/lena_feathers.png"}; + + for (int i = 0; i < 2; ++i) + { + const string model = findDataFile(models[i], false); + Net net = readNetFromTorch(model); + + Mat img = imread(findDataFile("dnn/googlenet_1.png", false)); + Mat inputBlob = blobFromImage(img, 1.0, Size(), Scalar(103.939, 116.779, 123.68), false); + + net.setInput(inputBlob); + Mat out = net.forward(); + + // Deprocessing. 
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index 8c894ae7d9..5015d5dd0f 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -231,6 +231,7 @@ TEST(Torch_Importer, net_padding)
 {
     runTorchNet("net_padding", DNN_TARGET_CPU, "", false, true);
     runTorchNet("net_spatial_zero_padding", DNN_TARGET_CPU, "", false, true);
+    runTorchNet("net_spatial_reflection_padding", DNN_TARGET_CPU, "", false, true);
 }
 
 TEST(Torch_Importer, ENet_accuracy)
@@ -338,6 +339,49 @@ OCL_TEST(Torch_Importer, ENet_accuracy)
     }
 }
 
+// Check accuracy of style transfer models from https://github.com/jcjohnson/fast-neural-style
+// th fast_neural_style.lua \
+//   -input_image ~/opencv_extra/testdata/dnn/googlenet_1.png \
+//   -output_image lena.png \
+//   -median_filter 0 \
+//   -image_size 0 \
+//   -model models/eccv16/starry_night.t7
+// th fast_neural_style.lua \
+//   -input_image ~/opencv_extra/testdata/dnn/googlenet_1.png \
+//   -output_image lena.png \
+//   -median_filter 0 \
+//   -image_size 0 \
+//   -model models/instance_norm/feathers.t7
+TEST(Torch_Importer, FastNeuralStyle_accuracy)
+{
+    std::string models[] = {"dnn/fast_neural_style_eccv16_starry_night.t7",
+                            "dnn/fast_neural_style_instance_norm_feathers.t7"};
+    std::string targets[] = {"dnn/lena_starry_night.png", "dnn/lena_feathers.png"};
+
+    for (int i = 0; i < 2; ++i)
+    {
+        const string model = findDataFile(models[i], false);
+        Net net = readNetFromTorch(model);
+
+        Mat img = imread(findDataFile("dnn/googlenet_1.png", false));
+        Mat inputBlob = blobFromImage(img, 1.0, Size(), Scalar(103.939, 116.779, 123.68), false);
+
+        net.setInput(inputBlob);
+        Mat out = net.forward();
+
+        // Deprocessing.
+        getPlane(out, 0, 0) += 103.939;
+        getPlane(out, 0, 1) += 116.779;
+        getPlane(out, 0, 2) += 123.68;
+        out = cv::min(cv::max(0, out), 255);
+
+        Mat ref = imread(findDataFile(targets[i]));
+        Mat refBlob = blobFromImage(ref, 1.0, Size(), Scalar(), false);
+
+        normAssert(out, refBlob, "", 0.5, 1.1);
+    }
+}
+
 }
 #endif
diff --git a/samples/dnn/fast_neural_style.py b/samples/dnn/fast_neural_style.py
new file mode 100644
index 0000000000..945b25571b
--- /dev/null
+++ b/samples/dnn/fast_neural_style.py
@@ -0,0 +1,51 @@
+import cv2 as cv
+import numpy as np
+import argparse
+
+parser = argparse.ArgumentParser(
+        description='This script is used to run style transfer models from '
+                    'https://github.com/jcjohnson/fast-neural-style using OpenCV')
+parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
+parser.add_argument('--model', help='Path to .t7 model')
+parser.add_argument('--width', default=-1, type=int, help='Resize input to specific width.')
+parser.add_argument('--height', default=-1, type=int, help='Resize input to specific height.')
+parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of postprocessing blurring.')
+args = parser.parse_args()
+
+net = cv.dnn.readNetFromTorch(args.model)
+
+if args.input:
+    cap = cv.VideoCapture(args.input)
+else:
+    cap = cv.VideoCapture(0)
+
+cv.namedWindow('Styled image', cv.WINDOW_NORMAL)
+while cv.waitKey(1) < 0:
+    hasFrame, frame = cap.read()
+    if not hasFrame:
+        cv.waitKey()
+        break
+
+    inWidth = args.width if args.width != -1 else frame.shape[1]
+    inHeight = args.height if args.height != -1 else frame.shape[0]
+    inp = cv.dnn.blobFromImage(frame, 1.0, (inWidth, inHeight),
+                               (103.939, 116.779, 123.68), swapRB=False, crop=False)
+
+    net.setInput(inp)
+    out = net.forward()
+
+    out = out.reshape(3, out.shape[2], out.shape[3])
+    out[0] += 103.939
+    out[1] += 116.779
+    out[2] += 123.68
+    out /= 255
+    out = out.transpose(1, 2, 0)
+
+    t, _ = net.getPerfProfile()
+    freq = cv.getTickFrequency() / 1000
+    print(t / freq, 'ms')
+
+    if args.median_filter:
+        out = cv.medianBlur(out, args.median_filter)
+
+    cv.imshow('Styled image', out)
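cv.imshow assumes a [0, 1] range for floating-point images, which is why the
sample divides by 255 before display. If one wanted to save the stylized frame
instead, the end of the loop could convert back to 8-bit, e.g. (a suggestion,
not part of the patch):

    # Continuing inside the sample's loop: `out` is the float HWC image.
    styled = np.clip(out * 255, 0, 255).astype(np.uint8)
    cv.imwrite('styled.png', styled)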