diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index 70f9a5a73e..c1236c4653 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -9,6 +9,7 @@ endif() set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass") ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV) +ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js) diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 794bfeedda..fbe16850d4 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -258,6 +258,14 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer + { + public: + int input_zp, output_zp; + float output_sc; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer { public: @@ -300,6 +308,13 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS PoolingLayerInt8 : public PoolingLayer + { + public: + int input_zp, output_zp; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS SoftmaxLayer : public Layer { public: @@ -308,6 +323,14 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS SoftmaxLayerInt8 : public SoftmaxLayer + { + public: + float output_sc; + int output_zp; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS InnerProductLayer : public Layer { public: @@ -315,6 +338,13 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS InnerProductLayerInt8 : public InnerProductLayer + { + public: + int output_zp; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS MVNLayer : public Layer { public: @@ -341,6 +371,22 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS QuantizeLayer : public Layer + { + public: + float scale; + int zeropoint; + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS DequantizeLayer : public Layer + { + public: + float scale; + int zeropoint; + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS ConcatLayer : public Layer { public: @@ -352,6 +398,7 @@ CV__DNN_INLINE_NS_BEGIN * Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat */ bool padding; + int paddingValue; static Ptr create(const LayerParams ¶ms); }; @@ -459,7 +506,11 @@ CV__DNN_INLINE_NS_BEGIN { public: virtual void forwardSlice(const float* src, float* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const = 0; + size_t outPlaneSize, int cn0, int cn1) const {}; + virtual void forwardSlice(const int* src, const int* lut, int* dst, int len, + size_t outPlaneSize, int cn0, int cn1) const {}; + virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, + size_t outPlaneSize, int cn0, int cn1) const {}; }; class CV_EXPORTS ReLULayer : public ActivationLayer @@ -542,6 +593,12 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + /* Layers used in semantic segmentation */ class CV_EXPORTS 
CropLayer : public Layer @@ -563,6 +620,12 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS EltwiseLayerInt8 : public Layer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS BatchNormLayer : public ActivationLayer { public: @@ -572,6 +635,14 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS BatchNormLayerInt8 : public BatchNormLayer + { + public: + float input_sc, output_sc; + int input_zp, output_zp; + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS MaxUnpoolLayer : public Layer { public: @@ -591,12 +662,26 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS ScaleLayerInt8 : public ScaleLayer + { + public: + float output_sc; + int output_zp; + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS ShiftLayer : public Layer { public: static Ptr create(const LayerParams& params); }; + class CV_EXPORTS ShiftLayerInt8 : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS DataAugmentationLayer : public Layer { public: diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index a498039f65..bf1670051a 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -235,6 +235,15 @@ CV__DNN_INLINE_NS_BEGIN */ virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals); + /** @brief Tries to quantize the given layer and compute the quantization parameters required for fixed point implementation. + * @param[in] scales input and output scales. + * @param[in] zeropoints input and output zeropoints. + * @param[out] params Quantized parameters required for fixed point implementation of that layer. + * @returns True if layer can be quantized. + */ + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params); + /** @brief Given the @p input blobs, computes the output @p blobs. * @param[in] inputs the input blobs. * @param[out] outputs allocated output blobs, which will store results of the computation. @@ -368,6 +377,16 @@ CV__DNN_INLINE_NS_BEGIN */ virtual void getScaleShift(Mat& scale, Mat& shift) const; + /** + * @brief Returns scale and zeropoint of layers + * @param[out] scale Output scale + * @param[out] zeropoint Output zeropoint + * + * By default, @p scale is 1 and @p zeropoint is 0. + */ + virtual void getScaleZeropoint(float& scale, int& zeropoint) const; + + /** * @brief "Deattaches" all the layers, attached to particular layer. */ @@ -453,13 +472,21 @@ CV__DNN_INLINE_NS_BEGIN /** @brief Adds new layer to the net. * @param name unique name of the adding layer. * @param type typename of the adding layer (type must be registered in LayerRegister). + * @param dtype datatype of output blobs. * @param params parameters which will be used to initialize the creating layer. * @returns unique identifier of created layer, or -1 if a failure will happen. */ + int addLayer(const String &name, const String &type, const int &dtype, LayerParams ¶ms); + + /** @overload Datatype of output blobs set to default CV_32F */ int addLayer(const String &name, const String &type, LayerParams ¶ms); + /** @brief Adds new layer and connects its first input to the first output of previously added layer. 
* @see addLayer() */ + int addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams ¶ms); + + /** @overload */ int addLayerToPrev(const String &name, const String &type, LayerParams ¶ms); /** @brief Converts string name of the layer to the integer identifier. @@ -551,6 +578,25 @@ CV__DNN_INLINE_NS_BEGIN CV_WRAP_AS(forwardAndRetrieve) void forward(CV_OUT std::vector >& outputBlobs, const std::vector& outBlobNames); + /** @brief Returns a quantized Net from a floating-point Net. + * @param calibData Calibration data to compute the quantization parameters. + * @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S. + * @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S. + */ + CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype); + + /** @brief Returns input scale and zeropoint for a quantized Net. + * @param scales output parameter for returning input scales. + * @param zeropoints output parameter for returning input zeropoints. + */ + CV_WRAP void getInputDetails(CV_OUT std::vector& scales, CV_OUT std::vector& zeropoints) const; + + /** @brief Returns output scale and zeropoint for a quantized Net. + * @param scales output parameter for returning output scales. + * @param zeropoints output parameter for returning output zeropoints. + */ + CV_WRAP void getOutputDetails(CV_OUT std::vector& scales, CV_OUT std::vector& zeropoints) const; + /** * @brief Compile Halide layers. * @param[in] scheduler Path to YAML file with scheduling directives. diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 52a5fcba28..492ad166d0 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -574,9 +574,9 @@ struct LayerPin struct LayerData { - LayerData() : id(-1), skip(false), flag(0) {} - LayerData(int _id, const String &_name, const String &_type, LayerParams &_params) - : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0) + LayerData() : id(-1), dtype(CV_32F), skip(false), flag(0) {} + LayerData(int _id, const String &_name, const String &_type, const int &_dtype, LayerParams &_params) + : id(_id), name(_name), type(_type), dtype(_dtype), params(_params), skip(false), flag(0) { CV_TRACE_FUNCTION(); @@ -588,6 +588,7 @@ struct LayerData int id; String name; String type; + int dtype; // Datatype of output blobs. LayerParams params; std::vector inputBlobsId; @@ -944,7 +945,7 @@ public: } } - void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half) + void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, const int& dtype) { if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS) { @@ -966,7 +967,8 @@ public: { Mat& unusedBlob = hostIt->second; if (unusedBlob.total() >= targetTotal && - unusedBlob.total() < bestBlobTotal) + unusedBlob.total() < bestBlobTotal && + unusedBlob.type() == dtype) { bestBlobPin = hostIt->first; bestBlob = unusedBlob; @@ -985,14 +987,13 @@ public: { // if dst already has been allocated with total(shape) elements, // it won't be recreated and pointer of dst.data remains the same. - dst.create(shape, use_half ? 
CV_16S : CV_32F); + dst.create(shape, dtype); addHost(lp, dst); } } void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes, - std::vector& pinsForInternalBlobs, - bool use_half = false) + std::vector& pinsForInternalBlobs) { CV_TRACE_FUNCTION(); @@ -1063,7 +1064,7 @@ public: reuse(ld.inputBlobsId[0], blobPin); } else - reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half); + reuseOrCreate(shapes[index], blobPin, *blobs[index], ld.dtype); } } } @@ -1193,6 +1194,7 @@ struct Net::Impl : public detail::NetImplBase lastLayerId = 0; netWasAllocated = false; + netWasQuantized = false; fusion = true; isAsync = false; preferableBackend = DNN_BACKEND_DEFAULT; @@ -1217,6 +1219,7 @@ struct Net::Impl : public detail::NetImplBase int lastLayerId; bool netWasAllocated; + bool netWasQuantized; bool fusion; bool isAsync; std::vector layersTimings; @@ -1372,7 +1375,7 @@ struct Net::Impl : public detail::NetImplBase currLayer->unsetAttached(); } - + netWasAllocated = false; layersTimings.clear(); } @@ -2541,10 +2544,11 @@ struct Net::Impl : public detail::NetImplBase CV_Assert(layerShapesIt != layersShapes.end()); + if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F) + ld.dtype = CV_16S; + std::vector pinsForInternalBlobs; - blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, - preferableBackend == DNN_BACKEND_OPENCV && - preferableTarget == DNN_TARGET_OPENCL_FP16); + blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs); ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); for (int i = 0; i < ld.outputBlobs.size(); ++i) ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]); @@ -3148,7 +3152,8 @@ struct Net::Impl : public detail::NetImplBase Mat& inp = layers[0].outputBlobs[i]; CV_Assert(inp.total()); if (preferableBackend == DNN_BACKEND_OPENCV && - preferableTarget == DNN_TARGET_OPENCL_FP16) + preferableTarget == DNN_TARGET_OPENCL_FP16 && + layers[0].dtype == CV_32F) { layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S); } @@ -3458,6 +3463,25 @@ struct Net::Impl : public detail::NetImplBase #endif } + void getQuantizationParams(const Mat& src, std::vector& scales, std::vector& zeropoints) + { + const int qmin = -128; // INT8_MIN + const int qmax = 127; // INT8_MAX + + double rmin, rmax, sc, zp; + cv::minMaxIdx(src, &rmin, &rmax); + + // 0 must be present in the range [rmin, rmax] + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + + sc = (rmax == rmin) ? 
1.0 : (rmax - rmin)/(qmax - qmin); + zp = qmin - (rmin/sc); + + scales.push_back((float)sc); + zeropoints.push_back((int)std::round(zp)); + } + void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) { std::vector& inputLayerIds = layers[id].inputBlobsId; @@ -3588,7 +3612,8 @@ struct Net::Impl : public detail::NetImplBase Mat& inp = layers[0].outputBlobs[i]; CV_Assert(inp.total()); if (preferableBackend == DNN_BACKEND_OPENCV && - preferableTarget == DNN_TARGET_OPENCL_FP16) + preferableTarget == DNN_TARGET_OPENCL_FP16 && + layers[0].dtype == CV_32F) { layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S); } @@ -3614,7 +3639,7 @@ struct Net::Impl : public detail::NetImplBase const MatShape& shape = layersShapes[inputLayerId].out[inputLayerIds[i].oid]; layersShapes[layerId].in.push_back(shape); } - it->second.layerInstance->updateMemoryShapes(layersShapes[layerId].in); + it->second.getLayerInstance()->updateMemoryShapes(layersShapes[layerId].in); } } } @@ -4019,7 +4044,7 @@ Net::~Net() { } -int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) +int Net::addLayer(const String &name, const String &type, const int &dtype, LayerParams ¶ms) { CV_TRACE_FUNCTION(); @@ -4042,23 +4067,35 @@ int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) id = ++impl->lastLayerId; impl->layerNameToId.insert(std::make_pair(name, id)); - impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params))); + impl->layers.insert(std::make_pair(id, LayerData(id, name, type, dtype, params))); if (params.get("has_dynamic_shapes", false)) impl->hasDynamicShapes = true; return id; } -int Net::addLayerToPrev(const String &name, const String &type, LayerParams ¶ms) +int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) +{ + CV_TRACE_FUNCTION(); + return addLayer(name, type, CV_32F, params); +} + +int Net::addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams ¶ms) { CV_TRACE_FUNCTION(); int prvLid = impl->lastLayerId; - int newLid = this->addLayer(name, type, params); + int newLid = this->addLayer(name, type, dtype, params); this->connect(prvLid, 0, newLid, 0); return newLid; } +int Net::addLayerToPrev(const String &name, const String &type, LayerParams ¶ms) +{ + CV_TRACE_FUNCTION(); + return addLayerToPrev(name, type, CV_32F, params); +} + void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum) { CV_TRACE_FUNCTION(); @@ -4169,16 +4206,19 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName) ld.outputBlobsWrappers[i]->copyToHost(); } } - if (ld.outputBlobs[0].depth() == CV_32F) + if (ld.outputBlobs[0].depth() == CV_16S) { - std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); - outputvec = ld.outputBlobs; - } else { std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); outputvec.resize(ld.outputBlobs.size()); for (int i = 0; i < outputvec.size(); i++) convertFp16(ld.outputBlobs[i], outputvec[i]); } + else + { + // Output depth can be CV_32F or CV_8S + std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); + outputvec = ld.outputBlobs; + } } else if (outputBlobs.isUMatVector()) { @@ -4264,11 +4304,277 @@ void Net::forward(std::vector >& outputBlobs, } } +Net Net::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype) +{ + CV_TRACE_FUNCTION(); + + // Net can be quantized only once. 
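    // ------------------------------------------------------------------------
    // Illustrative use of the API added in this patch (sketch only, not part of
    // the patch itself; "model.onnx", the calibration image names and the
    // 224x224 blob size are placeholder assumptions):
    //
    //   Net netFP32 = readNet("model.onnx");
    //   std::vector<Mat> calibImages = { imread("calib_0.jpg"), imread("calib_1.jpg") };
    //   Mat calibBlob = blobFromImages(calibImages, 1.0/255, Size(224, 224));
    //
    //   // CV_8S keeps the network borders in int8; passing CV_32F instead makes
    //   // quantize() insert Quantize/Dequantize layers at the borders so the
    //   // caller keeps working with float blobs.
    //   Net netInt8 = netFP32.quantize(calibBlob, CV_8S, CV_8S);
    //
    //   std::vector<float> inSc, outSc;
    //   std::vector<int>   inZp, outZp;
    //   netInt8.getInputDetails(inSc, inZp);    // real_value = scale * (q - zeropoint)
    //   netInt8.getOutputDetails(outSc, outZp);
    // ------------------------------------------------------------------------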
+ if (impl->netWasQuantized) + CV_Error(Error::StsBadArg, "Cannot quantize a quantized net"); + + CV_CheckType(inputsDtype, inputsDtype == CV_32F || inputsDtype == CV_8S, "Input depth should be CV_32F or CV_8S"); + CV_CheckType(outputsDtype, outputsDtype == CV_32F || outputsDtype == CV_8S, "Output depth should be CV_32F or CV_8S"); + + bool originalFusion = impl->fusion; + int prefBackend = impl->preferableBackend; + int prefTarget = impl->preferableTarget; + + // Disable fusions and use CPU backend to quantize net + setPreferableBackend(DNN_BACKEND_OPENCV); + setPreferableTarget(DNN_TARGET_CPU); + enableFusion(false); + + if (calibData.isMat()) + { + setInput(calibData.getMat()); + } + else if (calibData.isMatVector()) + { + std::vector calibDataVec; + calibData.getMatVector(calibDataVec); + + std::vector inpNames = impl->netInputLayer->outNames; + CV_CheckEQ(calibDataVec.size(), inpNames.size(), "Calibration data size should be equal to number of inputs"); + for (int i = 0; i < calibDataVec.size(); i++) + setInput(calibDataVec[i], inpNames[i]); + } + + std::vector outNames = getUnconnectedOutLayersNames(); + std::vector pins; + for (int i = 0; i < outNames.size(); i++) + pins.push_back(impl->getPinByAlias(outNames[i])); + impl->setUpNet(pins); + + // Compute scales and zeropoints for all the layers + std::vector > scales; + std::vector > zeropoints; + for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++) + { + LayerData& ld = it->second; + if (!ld.skip) + { + Ptr layer = ld.layerInstance; + std::vector inps(ld.inputBlobs.size()); + for (int i = 0; i < ld.inputBlobs.size(); ++i) + inps[i] = *ld.inputBlobs[i]; + layer->forward(inps, ld.outputBlobs, ld.internals); + } + + std::vector sc; + std::vector zp; + if (ld.type == "TanH") + { + sc.push_back(1.f/128); + zp.push_back(0); + } + else if (ld.type == "Sigmoid" || ld.type == "Softmax" || ld.type == "SoftMax") + { + if (ld.params.get("log_softmax", false)) + { + sc.push_back(16.f/256); + zp.push_back(127); + } + else + { + sc.push_back(1.f/256); + zp.push_back(-128); + } + } + else if (ld.type == "Split" || ld.type == "Slice" || ld.type == "Crop") + { + std::vector inp_sc; std::vector inp_zp; + impl->getQuantizationParams(*ld.inputBlobs[0], inp_sc, inp_zp); + sc.assign(ld.outputBlobs.size(), inp_sc[0]); + zp.assign(ld.outputBlobs.size(), inp_zp[0]); + } + else + { + for (int i = 0; i < ld.outputBlobs.size(); i++) + impl->getQuantizationParams(ld.outputBlobs[i], sc, zp); + } + scales.push_back(sc); + zeropoints.push_back(zp); + } + + // For some layers, the input and output scales/zeropoints must be equal so that rescaling of inputs + // is not needed during quantized inference. We start from the last layer and modify the layer's input scales/zeropoints + // TODO : Need a different approach. Current solution fails when 2 such layers have the same input layer + for (Impl::MapIdToLayerData::reverse_iterator it = impl->layers.rbegin(); it != impl->layers.rend(); ++it) + { + LayerData& ld = it->second; + // Layers with multiple outputs. 
Number of outputs is equal to number of inputs + if (ld.type == "Blank" || ld.type == "Dropout" || ld.type == "Identity" || ld.type == "Silence" || + ld.type == "Flatten" || ld.type == "Padding" || ld.type == "Permute" || ld.type == "Reshape" || + ld.type == "ReLU6" || ld.type == "Reorg" || ld.type == "ShuffleChannel" || + (ld.type == "ReLU" && !ld.params.get("negative_slope", 0.f)) /* ReLU with negative slope 0 */) + { + for (int i = 0; i < ld.outputBlobs.size(); i++) + { + LayerPin &pin = ld.inputBlobsId[i]; + scales[pin.lid][pin.oid] = scales[ld.id][i]; + zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][i]; + } + } + // Layers with multiple inputs and single output. + else if ((ld.type == "Pooling" && toLowerCase(ld.params.get("pool", "max")) == "max") /* Max Pooling */ || + (ld.type == "Eltwise" && toLowerCase(ld.params.get("operation", "sum")) == "max") /* Elementwise max */ || + ld.type == "Concat") + { + for (int i = 0; i < ld.inputBlobsId.size(); i++) + { + LayerPin &pin = ld.inputBlobsId[i]; + scales[pin.lid][pin.oid] = scales[ld.id][0]; + zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][0]; + } + } + } + + // Create a new Net and add quantized layers to it. + Net dstNet; + dstNet.impl->netWasQuantized = true; + dstNet.setInputsNames(impl->netInputLayer->outNames); + dstNet.setPreferableBackend(prefBackend); + dstNet.setPreferableTarget(prefTarget); + dstNet.enableFusion(originalFusion); + + for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++) + { + LayerData ld = it->second; + if (ld.id == 0) + { + LayerData &quantInpLd = dstNet.impl->layers[0]; + quantInpLd.dtype = inputsDtype; + quantInpLd.params.set("scales", DictValue::arrayReal(scales[0].data(), scales[0].size())); + quantInpLd.params.set("zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size())); + continue; + } + + std::vector inpPins = ld.inputBlobsId; + // Fill input and output scales/zeropoints for the layer + std::vector > inp_out_sc(2); + std::vector > inp_out_zp(2); + for (int i = 0; i < inpPins.size(); i++) + { + LayerPin &pin = inpPins[i]; + inp_out_sc[0].push_back(scales[pin.lid][pin.oid]); + inp_out_zp[0].push_back(zeropoints[pin.lid][pin.oid]); + } + inp_out_sc[1] = scales[ld.id]; + inp_out_zp[1] = zeropoints[ld.id]; + + // Quantize layer + Ptr layer = ld.layerInstance; + if (layer->tryQuantize(inp_out_sc, inp_out_zp, ld.params)) + { + ld.type += "Int8"; + ld.dtype = CV_8S; + } + ld.params.set("scales", DictValue::arrayReal(inp_out_sc[1].data(), inp_out_sc[1].size())); + ld.params.set("zeropoints", DictValue::arrayInt(inp_out_zp[1].data(), inp_out_zp[1].size())); + + // Check and add quantize/dequantize node before layer + for (int i = 0; i < inpPins.size(); i++) + { + LayerPin &pin = inpPins[i]; + LayerData &inpLd = dstNet.impl->getLayerData(impl->getLayerName(pin.lid)); + pin.lid = inpLd.id; + if (inpLd.dtype != ld.dtype) + { + String layerName = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? cv::format("quantize/%s/%d", inpLd.name.c_str(), pin.oid) + : cv::format("dequantize/%s/%d", inpLd.name.c_str(), pin.oid); + // Check if quantize/dequantize node for the input layer already exists + if (dstNet.impl->getLayerId(layerName) >= 0) + { + pin.lid = dstNet.impl->getLayerId(layerName); + pin.oid = 0; + } + else + { + LayerParams lp; + lp.set("scales", inp_out_sc[0][i]); + lp.set("zeropoints", inp_out_zp[0][i]); + lp.name = layerName; + lp.type = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? 
"Quantize" : "Dequantize"; + int newLid = dstNet.addLayer(lp.name, lp.type, ld.dtype, lp); + dstNet.connect(pin.lid, pin.oid, newLid, 0); + pin.lid = newLid; pin.oid = 0; + } + } + } + + // Add quantized layer to Net and connect to its inputs. + int newLid = dstNet.addLayer(ld.name, ld.type, ld.dtype, ld.params); + for( int i = 0; i < inpPins.size(); i++ ) + dstNet.connect(inpPins[i].lid, inpPins[i].oid, newLid, i); + + // If the layer is a output layer, add quantize/dequantize node after it based on output's data type. + if (ld.requiredOutputs.size() == 0 && ld.dtype != outputsDtype) + { + LayerParams lp; + lp.set("scales", inp_out_sc[1][0]); + lp.set("zeropoints", inp_out_zp[1][0]); + lp.name = ((ld.dtype == CV_32F && outputsDtype == CV_8S) ? "quantize/" : "dequantize/") + ld.name; + lp.type = (ld.dtype == CV_32F && outputsDtype == CV_8S) ? "Quantize" : "Dequantize"; + dstNet.addLayerToPrev(lp.name, lp.type, outputsDtype, lp); + } + } + // Restore FP32 Net's backend, target and fusion + setPreferableBackend(prefBackend); + setPreferableTarget(prefTarget); + enableFusion(originalFusion); + return dstNet; +} + +void Net::getInputDetails(std::vector& scales, std::vector& zeropoints) const +{ + if (!impl->netWasQuantized) + CV_Error(Error::StsBadFunc, "Net isn't quantized"); + + LayerParams &lp = impl->layers[0].params; + DictValue sc = lp.get("scales"); + DictValue zp = lp.get("zeropoints"); + + for (int i = 0; i < sc.size(); i++) + { + scales.push_back(sc.get(i)); + zeropoints.push_back(zp.get(i)); + } +} + +void Net::getOutputDetails(std::vector& scales, std::vector& zeropoints) const +{ + if (!impl->netWasQuantized) + CV_Error(Error::StsBadFunc, "Net isn't quantized"); + + std::vector outLayerIds = getUnconnectedOutLayers(); + for (auto &lid : outLayerIds) + { + LayerParams &lp = impl->layers[lid].params; + DictValue sc = lp.get("scales"); + DictValue zp = lp.get("zeropoints"); + + for (int i = 0; i < sc.size(); i++) + { + scales.push_back(sc.get(i)); + zeropoints.push_back(zp.get(i)); + } + } +} + void Net::setPreferableBackend(int backendId) { CV_TRACE_FUNCTION(); CV_TRACE_ARG(backendId); + if (backendId == DNN_BACKEND_DEFAULT) + backendId = (Backend)PARAM_DNN_BACKEND_DEFAULT; + + if (impl->netWasQuantized && backendId != DNN_BACKEND_OPENCV) + { + CV_LOG_WARNING(NULL, "DNN: Only default backend supports quantized networks"); + backendId = DNN_BACKEND_OPENCV; + } + #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) backendId = getInferenceEngineBackendTypeParam(); @@ -4277,7 +4583,6 @@ void Net::setPreferableBackend(int backendId) if( impl->preferableBackend != backendId ) { impl->preferableBackend = backendId; - impl->netWasAllocated = false; impl->clear(); } } @@ -4287,6 +4592,13 @@ void Net::setPreferableTarget(int targetId) CV_TRACE_FUNCTION(); CV_TRACE_ARG(targetId); + if (impl->netWasQuantized && targetId != DNN_TARGET_CPU && + targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16) + { + CV_LOG_WARNING(NULL, "DNN: Only CPU and OpenCL/OpenCL FP16 target is supported by quantized networks"); + targetId = DNN_TARGET_CPU; + } + if( impl->preferableTarget != targetId ) { impl->preferableTarget = targetId; @@ -4306,7 +4618,6 @@ void Net::setPreferableTarget(int targetId) impl->preferableTarget = DNN_TARGET_OPENCL; #endif } - impl->netWasAllocated = false; impl->clear(); } } @@ -4935,9 +5246,10 @@ void Net::getMemoryConsumption(const int layerId, ShapesVec inLayerShapes, outLayerShapes; getLayerShapes(netInputShapes, layerId, inLayerShapes, 
outLayerShapes); + size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float); for(int i = 0; i < outLayerShapes.size(); i++) { - blobs += total(outLayerShapes[i]) * sizeof(float); + blobs += total(outLayerShapes[i]) * elemSize; } } @@ -4986,7 +5298,7 @@ void Net::getMemoryConsumption(const std::vector& netInputShapes, std::vector > inLayerShapes, outLayerShapes; getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes); - + size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float); for(int i = 0; i < layerIds.size(); i++) { int w = 0, b = 0; @@ -5001,7 +5313,7 @@ void Net::getMemoryConsumption(const std::vector& netInputShapes, for(int j = 0; j < outLayerShapes[i].size(); j++) { - b += total(outLayerShapes[i][j]) * sizeof(float); + b += total(outLayerShapes[i][j]) * elemSize; } weights.push_back(w); @@ -5021,7 +5333,6 @@ void Net::enableFusion(bool fusion) if( impl->fusion != fusion ) { impl->fusion = fusion; - impl->netWasAllocated = false; impl->clear(); } } @@ -5195,6 +5506,12 @@ void Layer::getScaleShift(Mat& scale, Mat& shift) const shift = Mat(); } +void Layer::getScaleZeropoint(float& scale, int& zeropoint) const +{ + scale = 1.f; + zeropoint = 0; +} + void Layer::unsetAttached() { setActivation(Ptr()); @@ -5321,6 +5638,12 @@ void Layer::run(const std::vector &inputs, std::vector &outputs, std:: this->forward(inputs, outputs, internals); } +bool Layer::tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) +{ + return false; +} + Layer::~Layer() {} bool Layer::getMemoryShapes(const std::vector &inputs, diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 1916aa0ec9..9d8a3783a2 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -141,6 +141,44 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(LSTM, LSTMLayer); CV_DNN_REGISTER_LAYER_CLASS(GRU, GRULayer); CV_DNN_REGISTER_LAYER_CLASS(CumSum, CumSumLayer); + + CV_DNN_REGISTER_LAYER_CLASS(Quantize, QuantizeLayer); + CV_DNN_REGISTER_LAYER_CLASS(Dequantize, DequantizeLayer); + CV_DNN_REGISTER_LAYER_CLASS(ConvolutionInt8, ConvolutionLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(InnerProductInt8, InnerProductLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(PoolingInt8, PoolingLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(EltwiseInt8, EltwiseLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(BatchNormInt8, BatchNormLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ScaleInt8, ScaleLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ShiftInt8, ShiftLayerInt8); + + CV_DNN_REGISTER_LAYER_CLASS(ReLUInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ReLU6Int8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SigmoidInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(TanHInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SwishInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(MishInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ELUInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(BNLLInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(AbsValInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SoftmaxInt8, SoftmaxLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SoftMaxInt8, SoftmaxLayerInt8); + + CV_DNN_REGISTER_LAYER_CLASS(ConcatInt8, ConcatLayer); + CV_DNN_REGISTER_LAYER_CLASS(FlattenInt8, FlattenLayer); + CV_DNN_REGISTER_LAYER_CLASS(PaddingInt8, PaddingLayer); + CV_DNN_REGISTER_LAYER_CLASS(BlankInt8, BlankLayer); + CV_DNN_REGISTER_LAYER_CLASS(DropoutInt8, BlankLayer); + 
CV_DNN_REGISTER_LAYER_CLASS(IdentityInt8, BlankLayer); + CV_DNN_REGISTER_LAYER_CLASS(SilenceInt8, BlankLayer); + CV_DNN_REGISTER_LAYER_CLASS(ConstInt8, ConstLayer); + CV_DNN_REGISTER_LAYER_CLASS(ReshapeInt8, ReshapeLayer); + CV_DNN_REGISTER_LAYER_CLASS(SplitInt8, SplitLayer); + CV_DNN_REGISTER_LAYER_CLASS(SliceInt8, SliceLayer); + CV_DNN_REGISTER_LAYER_CLASS(CropInt8, CropLayer); + CV_DNN_REGISTER_LAYER_CLASS(PermuteInt8, PermuteLayer); + CV_DNN_REGISTER_LAYER_CLASS(ReorgInt8, ReorgLayer); + CV_DNN_REGISTER_LAYER_CLASS(ShuffleChannelInt8, ShuffleChannelLayer); } CV__DNN_INLINE_NS_END diff --git a/modules/dnn/src/int8layers/batch_norm_layer.cpp b/modules/dnn/src/int8layers/batch_norm_layer.cpp new file mode 100644 index 0000000000..c5b8c3d9e9 --- /dev/null +++ b/modules/dnn/src/int8layers/batch_norm_layer.cpp @@ -0,0 +1,178 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" +#include + +namespace cv +{ +namespace dnn +{ + +class BatchNormLayerInt8Impl CV_FINAL : public BatchNormLayerInt8 +{ +public: + Mat origin_weights, origin_bias; + Mat weights_, bias_; + mutable int dims; + + BatchNormLayerInt8Impl(const LayerParams& params) + : dims(-1) + { + setParamsFrom(params); + useGlobalStats = params.get("use_global_stats", true); + input_sc = params.get("input_scale"); + input_zp = params.get("input_zeropoint"); + output_sc = params.get("scales"); + output_zp = params.get("zeropoints"); + + CV_Assert(blobs.size() == 2); + size_t n = blobs[0].total(); + CV_Assert(blobs[1].total() == n && + blobs[0].isContinuous() && blobs[1].isContinuous() && + blobs[0].type() == CV_32F && blobs[1].type() == CV_32F); + + origin_weights = blobs[0]; + origin_bias = blobs[1]; + } + + virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE + { + origin_weights.convertTo(weights_, CV_32F, input_sc/output_sc); + addWeighted(origin_bias, 1.0/output_sc, weights_, -input_zp, output_zp, bias_, CV_32F); + } + + void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE + { + scale = origin_weights; + shift = origin_bias; + } + + void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE + { + scale = output_sc; + zeropoint = output_zp; + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + Mat w_, b_; + top->getScaleShift(w_, b_); + if (w_.empty() && b_.empty()) + return false; + + const int numChannels = weights_.total(); + const int numFusedWeights = w_.total(); + const int numFusedBias = b_.total(); + + if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w_.empty()) || + (numFusedBias != numChannels && numFusedBias != 1 && !b_.empty())) + return false; + + float new_sc; + int new_zp; + top->getScaleZeropoint(new_sc, new_zp); + + Mat w = numFusedWeights == 1 ? Mat(1, numChannels, CV_32F, Scalar(w_.at(0))) : + (w_.empty() ? Mat::ones(1, numChannels, CV_32F) : w_.reshape(1, 1)); + + Mat b = numFusedBias == 1 ? Mat(1, numChannels, CV_32F, Scalar(b_.at(0))) : + (b_.empty() ? 
Mat::zeros(1, numChannels, CV_32F) : b_.reshape(1, 1)); + + weights_ = Mat(); bias_ = Mat(); + multiply(origin_weights, w, weights_, input_sc/new_sc, CV_32F); + multiply(origin_bias, w, bias_); + add(bias_, b, bias_); + addWeighted(bias_, 1.0/new_sc, weights_, -input_zp, new_zp, bias_, CV_32F); + return true; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + dims = inputs[0].size(); + if (!useGlobalStats && inputs[0][0] != 1) + CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1"); + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return true; + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + return activ_int8->blobs.empty(); + } + return false; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + CV_Assert(blobs.size() == 2); + CV_Assert(inputs.size() == 1); + + Mat &inpBlob = inputs[0]; + int planeSize = 1; + for (size_t i = 2; i < inpBlob.dims; i++) { + planeSize *= inpBlob.size[i]; + } + + for (size_t ii = 0; ii < outputs.size(); ii++) + { + Mat &outBlob = outputs[ii]; + + for(int num = 0; num < outBlob.size[0]; num++) + { + for (int n = 0; n < outBlob.size[1]; n++) + { + float w = weights_.at(n); + float b = bias_.at(n); + Mat inpBlobPlane(1, planeSize, CV_8S, inpBlob.ptr(num, n)); + Mat outBlobPlane(1, planeSize, CV_8S, outBlob.ptr(num, n)); + inpBlobPlane.convertTo(outBlobPlane, CV_8S, w, b); + } + } + } + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + + int64 flops = 0; + for(int i = 0; i < inputs.size(); i++) + { + flops += 3*total(inputs[i]); + } + return flops; + } + +private: + bool useGlobalStats; +}; + +Ptr BatchNormLayerInt8::create(const LayerParams& params) +{ + return Ptr(new BatchNormLayerInt8Impl(params)); +} + +} // namespace dnn +} // namespace cv diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp new file mode 100644 index 0000000000..05749885c0 --- /dev/null +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -0,0 +1,1136 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
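// Illustrative reference only (not used by the kernels below, and assuming the
// <cmath>/<algorithm>/<cstdint> headers pulled in by the includes that follow):
// the int8 convolution accumulates int8*int8 products into an int32 sum that
// already contains the fused bias from blobs[1] (which folds in
// -input_zeropoint * sum_of_weights), then applies the per-output-channel float
// multiplier from blobs[2], shifts by the output zero-point and saturates.
static inline int8_t requantizeRef(int acc, float multiplier, int outZp)
{
    int q = outZp + (int)std::round(acc * multiplier);  // rescale and shift
    return (int8_t)std::min(std::max(q, -128), 127);    // saturate to [-128, 127]
}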
+ +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include + +#include "opencv2/core/hal/hal.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include +#include + +namespace cv +{ +namespace dnn +{ + +#if CV_SIMD +static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b, + v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3) +{ + v_int16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + + v_int32x4 t0, t1; + v_mul_expand(a0, b0, t0, t1); + out0 += t0; out1 += t1; + + v_mul_expand(a1, b1, t0, t1); + out2 += t0; out3 += t1; +} +#endif + +class BaseConvolutionLayerInt8Impl : public ConvolutionLayerInt8 +{ +public: + BaseConvolutionLayerInt8Impl(const LayerParams ¶ms) + { + setParamsFrom(params); + getConvolutionKernelParams(params, kernel_size, pads_begin, pads_end, strides, dilations, padMode, adjust_pads); + + numOutput = params.get("num_output"); + int ngroups = params.get("group", 1); + CV_Assert(numOutput % ngroups == 0); + + input_zp = params.get("input_zeropoint"); + output_zp = params.get("zeropoints"); + output_sc = params.get("scales"); + + if (kernel_size.size() == 2) { + kernel = Size(kernel_size[1], kernel_size[0]); + stride = Size(strides[1], strides[0]); + for (int i = 0; i < pads_begin.size(); i++) { + if (pads_begin[i] != pads_end[i]) + CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer"); + } + pad = Size(pads_begin[1], pads_begin[0]); + dilation = Size(dilations[1], dilations[0]); + + adjustPad.height = adjust_pads[0]; + adjustPad.width = adjust_pads[1]; + } + + for (int i = 0; i < adjust_pads.size(); i++) { + CV_Assert(adjust_pads[i] < strides[i]); + } + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + // blobs[0] - Weights (INT8) + // blobs[1] - Biases (INT32) + // blobs[2] - Multipliers for convolution output stage (FP32) + CV_Assert(!inputs.empty() && blobs.size() == 3); + MatSize weightShape = blobs[0].size; + + CV_Assert(inputs[0].dims == outputs[0].dims); + if (weightShape.dims() == 3) + { + kernel_size.assign(1, kernel_size[0]); + strides.assign(1, strides[0]); + dilations.assign(1, dilations[0]); + pads_begin.assign(1, pads_begin[0]); + pads_end.assign(1, pads_end[0]); + } + CV_Assert(weightShape.dims() == kernel_size.size() + 2); + for (int i = 0; i < kernel_size.size(); i++) { + CV_Assert(weightShape[i + 2] == kernel_size[i]); + } + + const Mat &input = inputs[0]; + CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && input.type() == CV_8S); + for (size_t i = 0; i < outputs.size(); i++) + { + CV_Assert(inputs[i].type() == input.type()); + CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]); + for (int j = 0; j < inputs[i].dims; j++) { + CV_Assert(inputs[i].size[j] == input.size[j]); + } + } + + std::vector inpShape; + std::vector outShape; + for (int i = 2; i < inputs[0].dims; i++) { + inpShape.push_back(inputs[0].size[i]); + outShape.push_back(outputs[0].size[i]); + } + getConvPoolPaddings(inpShape, kernel_size, strides, padMode, pads_begin, pads_end); + if (pads_begin.size() == 2) { + for (int i = 0; i < pads_begin.size(); i++) { + if (pads_begin[i] != pads_end[i]) + CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer"); + } + pad = 
Size(pads_begin[1], pads_begin[0]); + } + } + + virtual MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const = 0; + bool is1x1() const + { + return (kernel.height == 1 && kernel.width == 1) && + (stride.height == 1 && stride.width == 1) && + (dilation.height == 1 && dilation.width == 1); + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + Mat w, b; + top->getScaleShift(w, b); + if (w.empty() && b.empty()) + return false; + + CV_Assert((w.empty() || w.type() == CV_32F) && + (b.empty() || b.type() == CV_32F)); + + float new_sc; + int new_zp; + top->getScaleZeropoint(new_sc, new_zp); + fuseWeights(w, b, new_sc); + output_sc = new_sc; + output_zp = new_zp; + return true; + } + + virtual void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) = 0; +}; + +//TODO: simultaneously convolution and bias addition for cache optimization +class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl +{ +public: + enum { VEC_ALIGN = 32, DFT_TYPE = CV_8S }; + Mat weightsMat; + std::vector biasvec; + Mat outputMultiplier; + Mat activationLUT; + Ptr activ; + + ConvolutionLayerInt8Impl(const LayerParams ¶ms) : BaseConvolutionLayerInt8Impl(params){} + + MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE + { + CV_Assert(!blobs.empty()); + int dims = inpShape.size(); + int inpD = dims == 5 ? inpShape[2] : 1; + int inpH = inpShape[dims - 2]; + int inpW = inpShape.back(); + int inpGroupCn = blobs[0].size[1]; + int ksize = inpGroupCn * std::accumulate(kernel_size.begin(), kernel_size.end(), + 1, std::multiplies()); + return shape(inpD * inpH * inpW, ksize); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + size_t ksize = kernel_size.size(); + // Only default backend and Conv1D/Conv2D/Conv3D are supported + return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(!blobs.empty()); + const int* weightShape = blobs[0].size.p; + CV_Assert(blobs[1].total() == (size_t)weightShape[0]); + + internals.clear(); + + CV_Assert(inputs.size() != 0); + std::vector inpShape(inputs[0].begin() + 2, inputs[0].end()); + + int outCn = weightShape[0]; + std::vector outShape; + outShape.push_back(inputs[0][0]); + outShape.push_back(outCn); + + int inpCn = inputs[0][1]; + if (padMode.empty()) + { + for (int i = 0; i < inpShape.size(); i++) + outShape.push_back((inpShape[i] + pads_begin[i] + pads_end[i] - dilations[i] * (kernel_size[i] - 1) - 1) / strides[i] + 1); + } + else + { + getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape); + } + + int ngroups = inpCn / weightShape[1]; + if (ngroups == 0 || ngroups * weightShape[1] != inpCn) + CV_Error(Error::StsError, format("Number of input channels should " + "be multiple of %d but got %d", weightShape[1], inpCn)); + CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0); + + outputs.resize(1, outShape); + + return false; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + BaseConvolutionLayerInt8Impl::finalize(inputs_arr, outputs_arr); + + std::vector inputs; + inputs_arr.getMatVector(inputs); + // prepare weightsMat where each row is aligned and has enough zero padding on the right to + // use vectorized (i.e. 
with intrinsics) loops without tail processing + Mat wm = blobs[0].reshape(1, numOutput); + if( wm.step1() % VEC_ALIGN != 0 ) + { + int newcols = (int)alignSize(wm.step1(), VEC_ALIGN); + Mat wm_buffer = Mat(numOutput, newcols, wm.type()); + Mat wm_padding = wm_buffer.colRange(wm.cols, newcols); + wm_padding.setTo(Scalar::all(0)); + Mat wm_aligned = wm_buffer.colRange(0, wm.cols); + wm.copyTo(wm_aligned); + wm = wm_aligned; + } + weightsMat = wm; + + Mat biasMat = blobs[1]; + biasvec.resize(numOutput+2); + for(int i = 0; i < numOutput; i++ ) + biasvec[i] = biasMat.at(i); + + outputMultiplier = blobs[2]; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + activ = activ_int8; + if (!activ_int8->blobs.empty()) + activ_int8->blobs[0].convertTo(activationLUT, CV_32S); + return true; + } + return false; + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + return BaseConvolutionLayerInt8Impl::tryFuse(top); + } + + void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) CV_OVERRIDE + { + const int outCn = weightsMat.size[0]; + Mat w = w_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(w_.at(0))) : w_; + Mat b = b_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(b_.at(0))) : b_; + CV_Assert_N(!weightsMat.empty(), biasvec.size() == outCn + 2, + w.empty() || outCn == w.total(), b.empty() || outCn == b.total()); + + for (int i = 0; i < outCn; ++i) + { + float off = outputMultiplier.at(i) * output_sc; + if (!w.empty()) + off *= w.at(i); + + if (!b.empty()) + biasvec[i] += (int)std::round(b.at(i)/off); + + outputMultiplier.at(i) = off/new_sc; + } + biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1]; + } + + class ParallelConv : public cv::ParallelLoopBody + { + public: + enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 }; + + const Mat* input_; + const Mat* weights_; + Mat* output_; + int outShape[4]; // used only for conv2d + std::vector kernel_size, pads_begin, pads_end, strides, dilations; + int ngroups_, nstripes_; + std::vector ofstab_; + const std::vector* biasvec_; + const Mat* activLUT_; + const ActivationLayerInt8* activ_; + bool is1x1_; + bool useAVX2; + bool useAVX512; + int blk_size_cn; + int inpZp, outZp; + const float* multiplier; + + ParallelConv() + : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), + biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false) + , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0) + {} + + static void run( const Mat& input, Mat& output, const Mat& weights, const Mat& multipliers, + const std::vector& biasvec, const Mat& activLUT, + const std::vector& kernel_size, const std::vector& strides, + const std::vector& pads_begin, const std::vector& pads_end, + const std::vector& dilations, + const ActivationLayerInt8* activ, int ngroups, int nstripes, int inp_Zp, int out_Zp) + { + size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(), + 1, std::multiplies()); + bool isConv1D = input.dims == 3; + bool isConv2D = input.dims == 4; + bool isConv3D = input.dims == 5; + CV_CheckEQ(static_cast(kernel_size.size()), input.dims - 2, ""); + CV_Assert_N(input.dims == output.dims, + input.size[0] == output.size[0], + weights.rows == output.size[1], + weights.cols == (input.size[1]/ngroups)*karea, + input.type() == CV_8SC1, + output.type() == CV_32SC1, + input.type() == weights.type(), + input.isContinuous(), + output.isContinuous(), + biasvec.size() == (size_t)output.size[1]+2); + CV_Check(weights.step1(), weights.step1() % VEC_ALIGN == 0, 
""); + ParallelConv p; + + p.input_ = &input; + p.weights_ = &weights; + p.output_ = &output; + int max_ind = isConv1D? 3: 4; + for( int i = 0; i < max_ind; i++ ) p.outShape[i] = output.size[i]; + p.outShape[1] /= ngroups; + + p.kernel_size = kernel_size; p.strides = strides; p.dilations = dilations; + p.pads_begin = pads_begin; p.pads_end = pads_end; + + p.ngroups_ = ngroups; + p.nstripes_ = nstripes; + + int inpCnAll = input.size[1]; + int depth = (input.dims == 5) ? input.size[2] : 1; + int width = input.size[input.dims - 1]; + int height = isConv1D? 1 : input.size[input.dims - 2]; + int inpCn = inpCnAll / ngroups; + + p.is1x1_ = (isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 && + pads_begin[0] == 0 && pads_begin[1] == 0) || + (isConv1D && pads_begin[0] == 0 && kernel_size[0] == 1); + + p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; + + int kernel_d = isConv3D? kernel_size[0] : 1; + int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + + int blk_size_cn0 = cvCeil(1600./(kernel_w*kernel_h)); + int ncn = 32; + while (ncn*2 < blk_size_cn0 && ncn < inpCn) + ncn *= 2; + ncn = std::min(ncn, inpCn); + p.blk_size_cn = ncn; + + int dil_d = isConv3D? dilations[0] : 1; + int dil_h = isConv1D? 1 : dilations[dilations.size() - 2]; + int dil_w = dilations.back(); + + p.inpZp = inp_Zp; + p.outZp = out_Zp; + p.multiplier = multipliers.ptr(0); + + p.ofstab_.resize(karea * ncn); + int* ofstab = &p.ofstab_[0]; + + if (isConv1D) + { + for( int k = 0; k < ncn; k++ ) + for( int k_c = 0; k_c < kernel_w; k_c++ ) + ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w; + } + else if (isConv2D) + { + for( int k = 0; k < ncn; k++ ) + for( int k_r = 0; k_r < kernel_h; k_r++ ) + for( int k_c = 0; k_c < kernel_w; k_c++ ) + ofstab[(k*kernel_h + k_r)*kernel_w + k_c] = + (k*height + k_r*dil_h)*width + k_c*dil_w; + } + else + { + for( int k = 0; k < ncn; k++ ) + for (int k_d = 0; k_d < kernel_d; k_d++) + for( int k_r = 0; k_r < kernel_h; k_r++ ) + for( int k_c = 0; k_c < kernel_w; k_c++ ) + ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] = + (k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w; + } + + p.biasvec_ = &biasvec; + p.activLUT_ = &activLUT; + p.activ_ = !activLUT.empty() ? activ : 0; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + virtual void operator ()(const Range &r0) const CV_OVERRIDE + { + const int valign = ConvolutionLayerInt8Impl::VEC_ALIGN; + int ngroups = ngroups_, batchSize = input_->size[0]*ngroups; + bool isConv1D = input_->dims == 3; + bool isConv2D = input_->dims == 4; + bool isConv3D = input_->dims == 5; + + int outW = output_->size[output_->dims - 1]; + int outH = isConv1D? 1 : output_->size[output_->dims - 2]; + int outCn = output_->size[1]/ngroups; + + int depth = isConv3D? input_->size[2] : 1; + int height = isConv1D? 1 : input_->size[input_->dims - 2]; + int width = input_->size[input_->dims - 1]; + int inpCn = input_->size[1]/ngroups; + + const int nstripes = nstripes_; + + int kernel_d = isConv3D? kernel_size[0] : 1; + int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + int karea = kernel_w*kernel_h*kernel_d; + + int pad_d = isConv3D? pads_begin[0] : 0; + int pad_t = isConv1D? 0 : pads_begin[pads_begin.size() - 2]; + int pad_l = pads_begin.back(); + + int stride_d = isConv3D? strides[0] : 0; + int stride_h = isConv1D? 
0 : strides[strides.size() - 2]; + int stride_w = strides.back(); + + int dilation_d = isConv3D? dilations[0] : 1; + int dilation_h = isConv1D? 1 : dilations[dilations.size() - 2]; + int dilation_w = dilations.back(); + + int i, j, k, d; + int inpPlaneSize = (int)input_->total(2); + int outPlaneSize = (int)output_->total(2); + bool is1x1 = is1x1_; + + int stripesPerSample; + int stripeSize; + Range r = r0; + bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 && + outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 && + width >= 16 + dilation_w*(kernel_w - 1); + // for now only 3x3 depth-wise convolutions are supported + depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 && + // computing at most 1 pixel from each side can involve padding + max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t && + pad_l <= 1 && pad_t <= 1; + + if( !depthWiseConvolution && nstripes >= batchSize*2 ) + { + stripesPerSample = nstripes/batchSize; + stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, 8); + stripeSize = std::min(stripeSize, outPlaneSize); + } + else + { + stripesPerSample = 1; + int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1); + r.start *= samplesPerStripe; + r.end *= samplesPerStripe; + stripeSize = outPlaneSize; + } + + const int8_t* data_inp0_ = input_->ptr(); + const int* ofstab = &ofstab_[0]; + const int8_t* wptr_orig_ = weights_->ptr(); + size_t wstep = weights_->step1(); + const int* biasptr_ = &biasvec_->at(0); + const int* lutptr_ = !activLUT_->empty() ? activLUT_->ptr() : 0; + int* data_out0_ = output_->ptr(); + AutoBuffer rowbuf0_; + int8_t* rowbuf0 = 0; + bool use_rowbuf = !depthWiseConvolution; + int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize); + + // im2row buffer is not used for depth-wise convolution + if(use_rowbuf) + { + size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size); + //printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize); + rowbuf0_.allocate(rowbufsz + valign); + rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(int8_t))); + // we clear the buffer once; ultimately, it lets us to avoid + // tail processing after running the unrolled/vectorized loop. + // the main idea is to make sure that the tail (a.k.a. padding) of each row + // (i.e. the elements with indices between vsz=karea*ncn and vsz_a) + // does not contain NaNs or Infs. Because the padding in the weights + // matrix is explicitly initialized with 0's, we handle all other + // cases nicely, i.e. we can skip expliciting re-initialization + // of the padding - we just retain elements from the previous iteration + // of the loop over channels (cn0). 
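            // Int8-specific note: the clear value is the *input zero-point*, not a
            // literal 0. A tap that reads inpZp is the quantized encoding of real 0.0,
            // and its w*inpZp product is cancelled by the -inpZp*sum_of_weights offset
            // fused into the int32 bias (see the depth-wise branch below), so padded
            // positions contribute nothing to the accumulator. The aligned tail
            // (indices vsz..vsz_a) is harmless in any case because the matching
            // columns of weightsMat were explicitly zero-padded in finalize().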
+ memset(rowbuf0, (int8_t)inpZp, rowbufsz*sizeof(rowbuf0[0]) ); + } + + for( int stripe = r.start; stripe < r.end; stripe++ ) + { + int subsampleIdx = stripe/stripesPerSample; + if( subsampleIdx >= batchSize ) + break; + int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize); + int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize); + const int8_t* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn; + int* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn; + int startOutCn = (subsampleIdx % ngroups)*outCn; + const int8_t* wptr_orig = wptr_orig_ + wstep*startOutCn; + const int* biasptr = biasptr_ + startOutCn; + const float* multptr = multiplier + startOutCn; + + for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn ) + { + int cn1 = std::min(cn0 + blk_size_cn, inpCn); + int ncn = cn1 - cn0, vsz = karea*ncn; + int vsz_a = (int)alignSize(vsz, valign); + const int8_t* wptr = wptr_orig + cn0*karea; + + for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size ) + { + int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd); + int bsz = ofs1 - ofs0; + + int out_d = ofs0 / (outH * outW); + int out_i = (ofs0 - out_d * outH * outW) / outW; + int out_j = ofs0 % outW; + + if (depthWiseConvolution) + { + CV_Assert(out_i == 0 && out_j == 0); + int in_d = out_d * stride_d - pad_d; + const int8_t* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width; + int* outptr_ = data_out0 + ofs0; + + #if CV_TRY_AVX2 + if(useAVX2) + opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); + else + #endif + { + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + int bias = biasptr[out_d], biasCopy; + float mult = multptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t* imgptr0 = inptr_ + in_i*width; + const int8_t* imgptr1 = imgptr0 + dilation_h*width; + const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out, out1; + // Bias has a fused offset component. bias = bias_quantized - input_zeropoint*sum_of_weights. + // In some cases below, certain weights are not used for convolution or set to zero. + // So we create a copy of bias at the start and remove the weight's components as necessary. 
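                            // Worked example of the adjustment below: when the top kernel row
                            // falls outside the image (in_i < 0), those taps must behave as real
                            // zeros. Zeroing w00..w02 drops their products, and adding
                            // inpZp*(w00 + w01 + w02) to biasCopy restores the part of the fused
                            // -inpZp*sum_of_weights offset that those taps would otherwise have
                            // cancelled, so out = clamp(outZp + round(acc*mult), -128, 127)
                            // matches a zero-padded convolution.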
+ biasCopy = bias; + + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + + (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + + (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + + biasCopy + inpZp*(w00 + w10 + w20); + out1 = outZp + (int)std::round(out*mult); + outptr[0] = std::min(std::max(out1, -128), 127); + out_j = 1; + } + #if CV_SIMD + if( stride_w == 1 ) + { + const int out_delta = 16; + v_int8x16 vw00 = v_setall_s8(w00), vw01 = v_setall_s8(w01), vw02 = v_setall_s8(w02), + vw10 = v_setall_s8(w10), vw11 = v_setall_s8(w11), vw12 = v_setall_s8(w12), + vw20 = v_setall_s8(w20), vw21 = v_setall_s8(w21), vw22 = v_setall_s8(w22); + v_int32x4 vout0, vout1, vout2, vout3, vbias = v_setall_s32(biasCopy), voutzp = v_setall_s32(outZp), + outmin = v_setall_s32(-128), outmax = v_setall_s32(127); + v_float32x4 vmult = v_setall_f32(mult); + for( ; out_j < outW1; out_j += out_delta ) + { + if (out_j + out_delta > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - out_delta; + } + int in_j = out_j * stride_w - pad_l; + v_int8x16 v00 = v_load(imgptr0 + in_j), + v01 = v_load(imgptr0 + in_j + dilation_w), + v02 = v_load(imgptr0 + in_j + dilation_w*2), + v10 = v_load(imgptr1 + in_j), + v11 = v_load(imgptr1 + in_j + dilation_w), + v12 = v_load(imgptr1 + in_j + dilation_w*2), + v20 = v_load(imgptr2 + in_j), + v21 = v_load(imgptr2 + in_j + dilation_w), + v22 = v_load(imgptr2 + in_j + dilation_w*2); + + vout0 = vout1 = vout2 = vout3 = vbias; + v_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + v_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + v_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + v_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + v_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + v_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + v_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult); + vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult); + vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult); + vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult); + + vout0 = v_min(v_max(vout0, outmin), outmax); + vout1 = v_min(v_max(vout1, outmin), outmax); + vout2 = v_min(v_max(vout2, outmin), outmax); + vout3 = v_min(v_max(vout3, outmin), outmax); + + v_store(outptr + out_j, vout0); + v_store(outptr + out_j + 4, vout1); + v_store(outptr + out_j + 8, vout2); + v_store(outptr + out_j + 12, vout3); + } + } + #endif + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + + (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + + (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; + out1 = outZp + (int)std::round(out*mult); + outptr[out_j] = std::min(std::max(out1, -128), 127); + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + int s0 = 1, s1 = 1, s2 = 
1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp*(w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp*(w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp*(w02 + w12 + w22); + } + out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + + (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + + (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; + out1 = outZp + (int)std::round(out*mult); + outptr[out_j] = std::min(std::max(out1, -128), 127); + } + } + } + continue; + } + // do im2row for a part of input tensor + int8_t* rowbuf = rowbuf0; + + if (isConv1D) + { + for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i ) + { + int delta = std::min(ofs1 - ofs, outW - out_j); + int out_j1 = out_j + delta; + + int in_j = out_j * stride_w - pad_l; + const int8_t* imgptr = data_inp0 + cn0*width + in_j; + ofs += delta; + + // do im2row for a part of input tensor + if( is1x1 ) + { + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w ) + { + for( k = 0; k < vsz; k++ ) + rowbuf[k] = imgptr[k*inpPlaneSize]; + } + } + else + { + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) + { + // this condition should be true for most of the tensor elements, i.e. + // most of the time the kernel aperture is inside the tensor X-Y plane. + if( out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w ) + { + for( k = 0; k < vsz; k++ ) + { + int k1 = ofstab[k]; + int8_t v0 = imgptr[k1]; + int8_t v1 = imgptr[k1 + stride_w]; + rowbuf[k] = v0; + rowbuf[k+vsz_a] = v1; + } + out_j++; + rowbuf += vsz_a; + imgptr += stride_w; + in_j += stride_w; + } + else + { + int i0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); + int i1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); + + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. 
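+                        // (note) rowbuf is pre-filled with the input zero-point rather than a literal 0:
+                        // taps that fall outside the image must contribute inpZp, i.e. the quantized
+                        // encoding of real-valued zero, so the dot product stays consistent.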
+ memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0])); + for( k = 0; k < ncn; k++ ) + { + for( i = i0; i < i1; i++ ) + { + int imgofs = k*width + i*dilation_w; + rowbuf[k*kernel_w + i] = imgptr[imgofs]; + } + } + } + } + } + } + } + else if (isConv2D) + { + if( is1x1 && stride_w == 1 && stride_h == 1 ) + { + const int8_t* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j; + for( int j = 0; j < bsz; j++, rowbuf += vsz_a ) + { + if( j + 4 <= bsz ) + { + k = 0; + for( ; k < vsz; k++ ) + { + const int8_t* inp = imgptr + j + k*inpPlaneSize; + int8_t v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3]; + rowbuf[k] = v0; + rowbuf[k + vsz_a] = v1; + rowbuf[k + vsz_a*2] = v2; + rowbuf[k + vsz_a*3] = v3; + } + j += 3; + rowbuf += vsz_a*3; + } + else + { + for( k = 0; k < vsz; k++ ) + { + rowbuf[k] = imgptr[j + k*inpPlaneSize]; + } + } + } + } + else + for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i ) + { + int delta = std::min(ofs1 - ofs, outW - out_j); + int out_j1 = out_j + delta; + + int in_i = out_i * stride_h - pad_t; + int in_j = out_j * stride_w - pad_l; + const int8_t* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j; + ofs += delta; + + // do im2row for a part of input tensor + if( is1x1 ) + { + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w ) + { + for( k = 0; k < vsz; k++ ) + rowbuf[k] = imgptr[k*inpPlaneSize]; + } + } + else + { + bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h; + int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h); + int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h); + + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) + { + // this condition should be true for most of the tensor elements, i.e. + // most of the time the kernel aperture is inside the tensor X-Y plane. + if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w ) + { + for( k = 0; k < vsz; k++ ) + { + int k1 = ofstab[k]; + int8_t v0 = imgptr[k1]; + int8_t v1 = imgptr[k1 + stride_w]; + rowbuf[k] = v0; + rowbuf[k+vsz_a] = v1; + } + out_j++; + rowbuf += vsz_a; + imgptr += stride_w; + in_j += stride_w; + } + else + { + int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); + int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); + + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. 
+ memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0])); + for( k = 0; k < ncn; k++ ) + { + for( i = i0; i < i1; i++ ) + { + for( j = j0; j < j1; j++ ) + { + int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w; + rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs]; + } + } + } + } + } + } + } + } + else + { + for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 ) + { + int delta = std::min(ofs1 - ofs, outW - out_j); + int out_j1 = out_j + delta; + + int in_d = out_d * stride_d - pad_d; + int in_i = out_i * stride_h - pad_t; + int in_j = out_j * stride_w - pad_l; + const int8_t* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j; + ofs += delta; + + int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); + int d1 = std::min(kernel_d, (depth - in_d + dilation_d - 1) / dilation_d); + + int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h); + int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h); + + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) + { + int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); + int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); + + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. + memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0])); + for( k = 0; k < ncn; k++ ) + { + for ( d = d0; d < d1; d++) + { + for( i = i0; i < i1; i++ ) + { + for( j = j0; j < j1; j++ ) + { + int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w; + rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs]; + } + } + } + } + } + } + } + // now compute dot product of the weights + // and im2row-transformed part of the tensor + #if CV_TRY_AVX512_SKX + if(useAVX512) + opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else + #endif + #if CV_TRY_AVX2 + if(useAVX2) + opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else + #endif + for( int i = 0; i < outCn; i += 2 ) + { + const int8_t* wptr0 = wptr + i*wstep; + const int8_t* wptr1 = wptr0 + wstep; + int* outptr0 = data_out0 + ofs0 + i*outPlaneSize; + int* outptr1 = outptr0 + outPlaneSize; + int bias0 = biasptr[i], bias1 = biasptr[i+1]; + float mult0 = multptr[i], mult1 = multptr[i+1]; + + if( i+1 >= outCn ) + { + wptr1 = wptr0; + outptr1 = outptr0; + bias1 = bias0; + mult1 = mult0; + } + int j = 0; + #if CV_SIMD128 + v_int32x4 voutzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127); + v_float32x4 vmult0 = v_setall_f32(mult0), vmult1 = v_setall_f32(mult1); + for( ; j <= bsz - 4; j += 4 ) + { + const int8_t* rptr = rowbuf0 + j*vsz_a; + v_int32x4 s0, s1; + + if( cn0 == 0 ) + { + s0 = v_setall_s32(bias0); + s1 = v_setall_s32(bias1); + } + else + { + s0 = v_load(outptr0 + j); + s1 = v_load(outptr1 + j); + } + + v_int32x4 vs00 = v_setzero_s32(), vs01 = v_setzero_s32(), + vs02 = v_setzero_s32(), vs03 = v_setzero_s32(), + vs10 = v_setzero_s32(), vs11 = v_setzero_s32(), + vs12 = v_setzero_s32(), vs13 = v_setzero_s32(); + for( k = 0; k < vsz; k += 16, rptr += 16 ) + { + v_int8x16 w0 = v_load_aligned(wptr0 + k); + v_int8x16 w1 = 
v_load_aligned(wptr1 + k); + v_int8x16 r0 = v_load_aligned(rptr); + v_int8x16 r1 = v_load_aligned(rptr + vsz_a); + v_int8x16 r2 = v_load_aligned(rptr + vsz_a*2); + v_int8x16 r3 = v_load_aligned(rptr + vsz_a*3); + + vs00 = v_dotprod_expand_fast(w0, r0, vs00); + vs01 = v_dotprod_expand_fast(w0, r1, vs01); + vs02 = v_dotprod_expand_fast(w0, r2, vs02); + vs03 = v_dotprod_expand_fast(w0, r3, vs03); + + vs10 = v_dotprod_expand_fast(w1, r0, vs10); + vs11 = v_dotprod_expand_fast(w1, r1, vs11); + vs12 = v_dotprod_expand_fast(w1, r2, vs12); + vs13 = v_dotprod_expand_fast(w1, r3, vs13); + } + s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03)); + s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)); + if( cn1 == inpCn ) + { + s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0); + s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1); + + s0 = v_min(v_max(s0, outmin), outmax); + s1 = v_min(v_max(s1, outmin), outmax); + } + v_store(outptr0 + j, s0); + v_store(outptr1 + j, s1); + } + #endif + for( ; j < bsz; j++ ) + { + const int8_t* rptr = rowbuf0 + j*vsz_a; + int s00, s10; + + if( cn0 == 0 ) + { + s00 = bias0; + s10 = bias1; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + } + + for( k = 0; k < vsz; k++ ) + { + int8_t r0 = rptr[k]; + s00 += (int)wptr0[k] * r0; + s10 += (int)wptr1[k] * r0; + } + if( cn1 == inpCn ) + { + int out0 = outZp + (int)std::round(s00*mult0); + int out1 = outZp + (int)std::round(s10*mult1); + + s00 = std::min(std::max(out0, -128), 127); + s10 = std::min(std::max(out1, -128), 127); + } + + outptr0[j] = s00; + outptr1[j] = s10; + } + } + } + } + if( activ_ ) + activ_->forwardSlice(data_out0 + stripeStart, lutptr_, + data_out0 + stripeStart, (int)(stripeEnd - stripeStart), + outPlaneSize, startOutCn, startOutCn + outCn); + } + } + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + +#if CV_SSE3 + uint32_t ftzMode = _MM_GET_FLUSH_ZERO_MODE(); + uint32_t dazMode = _MM_GET_DENORMALS_ZERO_MODE(); + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + /*if (inputs[0].dims > 3) { + printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n", + name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3], + kernel.width, kernel.height, pad.width, pad.height, + stride.width, stride.height, dilation.width, dilation.height); + } + else { + printf("conv %s: input (%d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n", + name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], + kernel.width, kernel.height, pad.width, pad.height, + stride.width, stride.height, dilation.width, dilation.height); + }*/ + + int inpGroupCn = blobs[0].size[1]; + CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % inpGroupCn == 0, + outputs.size() == 1, inputs[0].data != outputs[0].data); + + int ngroups = inputs[0].size[1] / inpGroupCn; + CV_Assert(outputs[0].size[1] % ngroups == 0); + + int nstripes = std::max(getNumThreads(), 1); + Mat outputInt32 = Mat(shape(outputs[0]), CV_32S); + + ParallelConv::run(inputs[0], outputInt32, weightsMat, outputMultiplier, biasvec, activationLUT, 
kernel_size, strides, + pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes, input_zp, output_zp); + + outputInt32.convertTo(outputs[0], CV_8S); + +#if CV_SSE3 + _MM_SET_FLUSH_ZERO_MODE(ftzMode); + _MM_SET_DENORMALS_ZERO_MODE(dazMode); +#endif + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_Assert(inputs.size() == outputs.size()); + + int64 flops = 0; + int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies()); + for (int i = 0; i < outputs.size(); i++) + { + flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1); + } + return flops; + } +}; + +Ptr ConvolutionLayerInt8::create(const LayerParams ¶ms) +{ + return Ptr(new ConvolutionLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/elementwise_layers.cpp b/modules/dnn/src/int8layers/elementwise_layers.cpp new file mode 100644 index 0000000000..75118b6bc1 --- /dev/null +++ b/modules/dnn/src/int8layers/elementwise_layers.cpp @@ -0,0 +1,190 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include +#include + +namespace cv +{ +namespace dnn +{ + +class ActivationLayerInt8Impl CV_FINAL : public ActivationLayerInt8 +{ +public: + ActivationLayerInt8Impl(const LayerParams ¶ms) + { + setParamsFrom(params); + activationLUT = !blobs.empty() ? blobs[0] : Mat(); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return true; + } + + class Activation : public cv::ParallelLoopBody + { + public: + const Mat* src; + const Mat* lut; + Mat* dst; + int nstripes; + + Activation() : src(0), lut(0), dst(0), nstripes(0){} + + static void run(const Mat& src, const Mat& lut, Mat& dst, int nstripes) + { + Activation p; + + p.src = &src; + p.lut = &lut; + p.dst = &dst; + p.nstripes = nstripes; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range &r) const CV_OVERRIDE + { + const int8_t* table = lut->ptr(); + int nsamples = 1, outCn = 1; + size_t planeSize = 1; + + if (src->dims > 1) + { + nsamples = src->size[0]; + outCn = src->size[1]; + } + else + outCn = src->size[0]; + + for (int i = 2; i < src->dims; ++i) + planeSize *= src->size[i]; + + size_t stripeSize = (planeSize + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = std::min(r.end*stripeSize, planeSize); + int len = (int)(stripeEnd - stripeStart); + + for( int i = 0; i < nsamples; i++ ) + { + const int8_t* srcptr = src->ptr(i) + stripeStart; + int8_t* dstptr = dst->ptr(i) + stripeStart; + for( int cn = 0; cn < outCn; cn++, srcptr += planeSize, dstptr += planeSize ) + { + int i = 0; +#if CV_SIMD128 + for( ; i <= len - 16; i += 16 ) + { + v_int8x16 out(table[srcptr[i] + 128], table[srcptr[i+1] + 128], table[srcptr[i+2] + 128], table[srcptr[i+3] + 128], + table[srcptr[i+4] + 128], table[srcptr[i+5] + 128], table[srcptr[i+6] + 128], table[srcptr[i+7] + 128], + table[srcptr[i+8] + 128], table[srcptr[i+9] + 128], table[srcptr[i+10] + 128], table[srcptr[i+11] + 128], + table[srcptr[i+12] + 128], 
table[srcptr[i+13] + 128], table[srcptr[i+14] + 128], table[srcptr[i+15] + 128]); + v_store(dstptr + i, out); + } +#endif + for( ; i < len; i++ ) + { + dstptr[i] = table[srcptr[i] + 128]; + } + } + } + } + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + for (size_t i = 0; i < inputs.size(); i++) + { + const Mat &src = inputs[i]; + if (!activationLUT.empty()) + { + const int nstripes = getNumThreads(); + Mat &dst = outputs[i]; + CV_Assert(src.size == dst.size && src.type() == dst.type() && + src.isContinuous() && dst.isContinuous() && src.type() == CV_8S); + + Activation::run(src, activationLUT, dst, nstripes); + } + else + { + src.copyTo(outputs[i]); + } + } + } + + void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE + { + for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize ) + { + int i = 0; +#if CV_SIMD128 + for( ; i <= len - 16; i += 16 ) + { + v_int8x16 out(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128], + lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128], + lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128], + lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]); + v_store(dst + i, out); + } +#endif + for( ; i < len; i++ ) + dst[i] = lut[src[i] + 128]; + } + } + + void forwardSlice(const int* src, const int* lut, int* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE + { + for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize ) + { + int i = 0; +#if CV_SIMD128 + for( ; i <= len - 16; i += 16 ) + { + v_int32x4 out0(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128]); + v_int32x4 out1(lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128]); + v_int32x4 out2(lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128]); + v_int32x4 out3(lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]); + + v_store(dst + i, out0); + v_store(dst + i + 4, out1); + v_store(dst + i + 8, out2); + v_store(dst + i + 12, out3); + } +#endif + for( ; i < len; i++ ) + dst[i] = lut[src[i] + 128]; + } + + } + + Mat activationLUT; +}; + +Ptr ActivationLayerInt8::create(const LayerParams& params) +{ + return Ptr(new ActivationLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/eltwise_layer.cpp b/modules/dnn/src/int8layers/eltwise_layer.cpp new file mode 100644 index 0000000000..be7a32b1ef --- /dev/null +++ b/modules/dnn/src/int8layers/eltwise_layer.cpp @@ -0,0 +1,577 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
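The activation layer above reduces every int8 activation function to a 256-entry lookup table stored in blobs[0]; the parallel Activation body and both forwardSlice overloads are vectorized forms of the same indexing. A minimal scalar sketch of that lookup (the helper name applyInt8LUT is illustrative and not part of the patch):

    #include <cstdint>

    // Illustrative sketch only: int8 values in [-128, 127] are shifted to [0, 255]
    // and used to index a 256-entry table, exactly as in forwardSlice above.
    static inline void applyInt8LUT(const int8_t* lut, const int8_t* src, int8_t* dst, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = lut[src[i] + 128];
    }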
+ +#include "../precomp.hpp" +#include "layers_common.hpp" +#include + +namespace cv +{ +namespace dnn +{ + +class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8 +{ +public: + enum EltwiseOp + { + PROD = 0, + SUM = 1, + MAX = 2 + } op; + std::vector coeffs; + std::vector zeropoints; + + enum OutputChannelsMode + { + ELTWISE_CHANNNELS_SAME = 0, //!< number of channels from inputs must be the same and equal to output's number of channels + ELTWISE_CHANNNELS_INPUT_0, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to number of channels of first input + //!< number of channels of other inputs should not be greater than number of channels of first input + ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to number of channels of first input + //!< there is restriction on number of channels of other inputs + //!< extra channels of other inputs is ignored + ELTWISE_CHANNNELS_USE_MAX, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to maximal number of input channels + //!< @note supported operation: `SUM` + } channelsModeInput; + + + mutable OutputChannelsMode channelsMode; //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal) + mutable /*size_t*/int outputChannels; + + EltwiseLayerInt8Impl(const LayerParams& params) + : outputChannels(0) + { + setParamsFrom(params); + offset = params.get("offset", 0.f); + hasVecInput = false; + op = SUM; + if (params.has("operation")) + { + String operation = toLowerCase(params.get("operation")); + if (operation == "prod") + op = PROD; + else if (operation == "sum") + op = SUM; + else if (operation == "max") + op = MAX; + else + CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\""); + } + + if (params.has("coeff")) + { + DictValue paramCoeff = params.get("coeff"); + int i, n = paramCoeff.size(); + coeffs.resize(n); + for (i = 0; i < n; i++) + { + coeffs[i] = paramCoeff.get(i); + } + } + + if (params.has("input_zeropoints")) + { + DictValue zp = params.get("input_zeropoints"); + int i, n = zp.size(); + zeropoints.resize(n); + for (i = 0; i < n; i++) + { + zeropoints[i] = zp.get(i); + } + } + + channelsModeInput = ELTWISE_CHANNNELS_SAME; + if (params.has("output_channels_mode")) + { + String v = toLowerCase(params.get("output_channels_mode")); + if (v == "same") + { + channelsModeInput = ELTWISE_CHANNNELS_SAME; + } + else if (v == "input_0") + { + channelsModeInput = ELTWISE_CHANNNELS_INPUT_0; + } + else if (v == "input_0_truncate") + { + channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE; + } + else if (v == "max_input_channels") + { + channelsModeInput = ELTWISE_CHANNNELS_USE_MAX; + if (op != SUM) + CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only"); + } + else + CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\""); + } + channelsMode = channelsModeInput; + + // TODO Must have checks for other unknown options + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() >= 2); + CV_Assert(inputs[0].size() >= 2); + CV_Assert(coeffs.size() == 0 || 
coeffs.size() == inputs.size()); + CV_Assert(op == SUM || op == PROD || coeffs.size() == 0); + + int dims = inputs[0].size(); + // Number of channels in output shape is determined by the first input tensor. + bool variableChannels = false; + int numChannels = inputs[0][1]; + for (size_t i = 1; i < inputs.size(); i++) + { + CV_Assert(inputs[0][0] == inputs[i][0]); // batch sizes are equal + + int input_channels = inputs[i][1]; + if (numChannels != input_channels) + variableChannels = true; + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME) + { + CV_Assert(numChannels == input_channels); + } + else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0) + { + CV_Assert(numChannels >= input_channels); + } + else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE) + { + // nothing to check + } + else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX) + { + numChannels = std::max(numChannels, input_channels); + } + else + { + CV_Assert(0 && "Internal error"); + } + } + + channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME; + outputChannels = numChannels; + + outputs.assign(1, inputs[0]); + outputs[0][1] = numChannels; + + if (dims > 2) + { + size_t vecIdx = 0; + bool isVecFound = false; + for (size_t i = 0; i < inputs.size(); i++) + { + bool allOnes = isAllOnes(inputs[i], 2, dims); + if (!allOnes && !isVecFound) + { + vecIdx = i; + isVecFound = true; + } + + if (!allOnes && i != vecIdx) + { + for (size_t j = 2; j < dims; j++) + { + CV_Assert(inputs[vecIdx][j] == inputs[i][j]); + } + } + } + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound) + { + for (size_t j = 2; j < dims; j++) + { + outputs[0][j] = inputs[vecIdx][j]; + } + } + } + + return false; + } + + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE + { + std::vector inputs; + inputs_arr.getMatVector(inputs); + + for (size_t i = 0; i < inputs.size(); i++) + { + MatShape inpShape = shape(inputs[i].size); + if (isAllOnes(inpShape, 2, inputs[i].dims)) + { + hasVecInput = true; + return; + } + } + } + + class EltwiseInvoker : public ParallelLoopBody + { + EltwiseLayerInt8Impl& self; + std::vector srcs; + std::vector srcNumChannels; + int nsrcs; + Mat* dst; + Mat* buf; + std::vector coeffs; + std::vector zeropoints; + int nstripes; + const Mat* activLUT; + const ActivationLayerInt8* activ; + int channels; + size_t planeSize; + float offset; + + EltwiseInvoker(EltwiseLayerInt8Impl& self_) + : self(self_) + , nsrcs(0), dst(0), buf(0), nstripes(0), activ(0), channels(0) + , planeSize(0), offset(0) + {} + + public: + static void run(EltwiseLayerInt8Impl& self, + const Mat* srcs, int nsrcs, Mat& buf, Mat& dst, + int nstripes, float offset) + { + const EltwiseOp op = self.op; + CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_8SC1, ""); CV_Assert(dst.isContinuous()); + CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs); + CV_CheckGE(nsrcs, 2, ""); + + CV_Assert(self.outputChannels == dst.size[1]); + + EltwiseInvoker p(self); + p.srcs.resize(nsrcs); + p.srcNumChannels.resize(nsrcs); + p.coeffs = self.coeffs; // can be sorted + p.zeropoints = self.zeropoints; + + bool sortInputs = false; + for( int i = 0; i < nsrcs; i++ ) + { + p.srcs[i] = &srcs[i]; + CV_CheckEQ(srcs[i].dims, dst.dims, ""); + CV_Assert(srcs[i].isContinuous()); + CV_Assert(srcs[i].type() == dst.type()); + p.srcNumChannels[i] = (srcs[i].dims >= 4) ? 
srcs[i].size[1] : 1; + + if (self.channelsMode == ELTWISE_CHANNNELS_SAME) + { + CV_Assert(srcs[i].size == dst.size); + } + else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0) + { + if (i == 0) + CV_Assert(srcs[0].size == dst.size); + CV_Assert(self.outputChannels >= p.srcNumChannels[i]); + sortInputs = true; + } + else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE) + { + if (i == 0) + CV_Assert(srcs[0].size == dst.size); + sortInputs = true; + } + else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX) + { + CV_Assert(op == SUM); + CV_Assert(self.outputChannels >= p.srcNumChannels[i]); + sortInputs = true; + } + else + { + CV_Assert(0 && "Internal error"); + } + + if (sortInputs) + { + // Sort srcs and coefficients in the desc order by number of channels + for (int j = i; j >= 1; j--) + { + if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1])) + { + std::swap(p.srcs[j - 1], p.srcs[j]); + std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]); + if (!p.coeffs.empty()) + std::swap(p.coeffs[j - 1], p.coeffs[j]); + if (!p.zeropoints.empty()) + std::swap(p.zeropoints[j - 1], p.zeropoints[j]); + } + else + break; + } + } + } + + p.nsrcs = nsrcs; + p.dst = &dst; + p.buf = &buf; + p.nstripes = nstripes; + p.offset = offset; + p.channels = (dst.dims >= 4 ? dst.size[1] : 1); + + p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1); + CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, ""); + p.activLUT = &self.activationLUT; + p.activ = !self.activationLUT.empty() ? self.activ.get() : 0; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range& r) const CV_OVERRIDE + { + const EltwiseOp op = self.op; + size_t total = dst->size[0]*planeSize; + size_t stripeSize = (total + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = std::min(r.end*stripeSize, total); + const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0; + const int* zeropointsptr = !zeropoints.empty() ? &zeropoints[0] : 0; + const int8_t* lutptr = !activLUT->empty() ? activLUT->ptr() : 0; + int8_t* dstptr0 = dst->ptr(); + float* bufptr0 = buf->ptr(); + int blockSize0 = 1 << 12; + + for (size_t ofs = stripeStart; ofs < stripeEnd; ) + { + int sampleIdx = (int)(ofs / planeSize); + int delta = (int)ofs - sampleIdx * planeSize; + int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta)); + if( blockSize <= 0 ) + break; + ofs += blockSize; + + for (int c = 0; c < channels; c++) + { + size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize; + int8_t* dstptr = dstptr0 + dstIdx; + float* bufptr = bufptr0 + dstIdx; + + // process first two inputs + { + const int8_t* srcptr0 = srcs[0]->ptr() + dstIdx; + + const int inputIdx = 1; + int src1_channels = srcNumChannels[inputIdx]; + if (c >= src1_channels) + { + // no data from second input + if (!coeffsptr) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = srcptr0[j]; + } + } + else + { + float c0 = coeffsptr[0]; + int z0 = op == PROD ? 
zeropointsptr[0] : 0; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] = c0 * (srcptr0[j] - z0); + } + } + } + else + { + size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize; + const int8_t* srcptrI = srcs[inputIdx]->ptr() + srcIdx; + + if (op == PROD) + { + float c0 = coeffsptr[0]; + float c1 = coeffsptr[1]; + int z0 = zeropointsptr[0]; + int z1 = zeropointsptr[1]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] = (c0*(srcptr0[j] - z0)) * (c1*(srcptrI[j] - z1)); + } + } + else if (op == MAX) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = std::max(srcptr0[j], srcptrI[j]); + } + } + else if (op == SUM) + { + float c0 = coeffsptr[0]; + float c1 = coeffsptr[1]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] = c0*srcptr0[j] + c1*srcptrI[j]; + } + } + else + CV_Error(Error::StsInternal, ""); + } + } + + // aggregate other inputs (3+) + for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++) + { + int srcI_channels = srcNumChannels[inputIdx]; + if (c >= srcI_channels) + continue; // no data from second input + size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize; + const int8_t* srcptrI = srcs[inputIdx]->ptr() + srcIdx; + + if (op == PROD) + { + float cI = coeffsptr[inputIdx]; + int zI = zeropointsptr[inputIdx]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] *= cI*(srcptrI[j] - zI); + } + } + else if (op == MAX) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = std::max(dstptr[j], srcptrI[j]); + } + } + else if (op == SUM) + { + float cI = coeffsptr[inputIdx]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] += cI * srcptrI[j]; + } + } + else + CV_Error(Error::StsInternal, ""); + } + + // add offset and saturate cast to int8 + if (op == SUM || op == PROD) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = saturate_cast(std::round(bufptr[j] + offset)); + } + } + } + if( activ ) + { + int8_t* ptr = dstptr0 + delta + sampleIdx*channels*planeSize; + activ->forwardSlice(ptr, lutptr, ptr, blockSize, planeSize, 0, channels); + } + } + } + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + CV_Assert(outputs.size() == 1); + const int nstripes = getNumThreads(); + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2) + { + for (size_t i = 0; i < inputs.size(); i++) + { + MatShape inpShape = shape(inputs[i].size); + bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims); + + if (allOnes) + { + Mat tmpInput = inputs[i]; + MatShape outShape = shape(outputs[0].size); + size_t xSize = outShape[2]; + for (size_t j = 3; j < outShape.size(); j++) + xSize *= outShape[j]; + + int dimVec[3] = {outShape[0], outShape[1], (int) xSize}; + std::vector matSizesVec(&dimVec[0], &dimVec[0] + 3); + inputs[i] = Mat(matSizesVec, tmpInput.type()); + + std::vector idx(outShape.size(), 0); + std::vector outIdx(inpShape.size(), 0); + + for (size_t j = 0; j < outShape[0]; j++) + { + outIdx[0] = idx[0] = j; + for(size_t k = 0; k < outShape[1]; k++) + { + outIdx[1] = idx[1] = k; + for (size_t x = 0; x < xSize; x++) + { + outIdx[2] = x; + inputs[i].at(outIdx.data()) = tmpInput.at(idx.data()); + } + } + } + inputs[i] = inputs[i].reshape(0, outShape); + } + } + } + + Mat buf = Mat(shape(outputs[0]), CV_32F); // to store intermediate results + 
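+        // SUM and PROD are accumulated in this float buffer with the per-input coeffs
+        // (and, for PROD, zero-points) applied; the invoker then adds 'offset', rounds and
+        // saturate-casts the result back to int8. MAX is computed directly on the int8 data.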
EltwiseInvoker::run(*this, &inputs[0], (int)inputs.size(), buf, outputs[0], nstripes, offset); + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + CV_Assert(inputs.size()); + + // FIXIT: handle inputs with different number of channels + long flops = inputs.size() * total(inputs[0]); + + return flops; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + activ = activ_int8; + if (!activ_int8->blobs.empty()) + activationLUT = activ_int8->blobs[0]; + return true; + } + return false; + } + + Mat activationLUT; + Ptr activ; + +private: + bool hasVecInput; + float offset; +}; + +Ptr EltwiseLayerInt8::create(const LayerParams& params) +{ + return Ptr(new EltwiseLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp new file mode 100644 index 0000000000..83da677a47 --- /dev/null +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -0,0 +1,266 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include + +namespace cv +{ +namespace dnn +{ + +class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8 +{ +public: + enum { VEC_ALIGN = 32 }; + FullyConnectedLayerInt8Impl(const LayerParams& params) + { + setParamsFrom(params); + output_zp = params.get("zeropoints"); + axis = params.get("axis", 1); + if (blobs.size() == 3) + { + // blobs[0] - Weights + // blobs[1] - Bias fused with offset + // blobs[2] - Multipliers for output stage + int numOutput = params.get("num_output"); + int innerSize = (int)blobs[0].total() / numOutput; + + CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total()); + CV_Assert((size_t)numOutput == blobs[1].total()); + + weightsMat = blobs[0] = blobs[0].reshape(1, numOutput); + int vecsize = weightsMat.cols; + if (vecsize % VEC_ALIGN != 0) + { + int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN); + Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type()); + Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned); + wpadding.setTo(Scalar::all(0)); + weightsMat = weightsBuf.colRange(0, vecsize); + blobs[0].copyTo(weightsMat); + } + biasMat = blobs[1] = blobs[1].reshape(1, 1); + outputMultiplier = blobs[2]; + } + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &) const CV_OVERRIDE + { + int numOutput, cAxis; + CV_CheckEQ(inputs.size(), (size_t)1, ""); + CV_CheckEQ(blobs[0].dims, 2, ""); + numOutput = blobs[0].size[0]; + CV_Assert((size_t)numOutput == blobs[1].total()); + cAxis = normalize_axis(axis, inputs[0]); + + MatShape outShape(cAxis + 1); + for (int i = 0; i < cAxis; ++i) + outShape[i] = inputs[0][i]; + outShape.back() = numOutput; + + outputs.resize(1, outShape); + return false; + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + virtual bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + activ = activ_int8; + if (!activ_int8->blobs.empty()) + activ_int8->blobs[0].convertTo(activationLUT, CV_32S); 
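+            // the LUT is converted to CV_32S because the fully connected kernel applies the
+            // activation through the int* overload of forwardSlice, on the int32 results,
+            // before the final convertTo(..., CV_8S) in forward().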
+ return true; + } + return false; + } + + class FullyConnected : public ParallelLoopBody + { + public: + FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0), + dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {} + + static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier, + const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp) + { + CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols && + dstMat.rows == srcMat.rows && dstMat.cols == weights.rows && + srcMat.type() == weights.type() && srcMat.type() == CV_8S && + dstMat.type() == CV_32S && biasMat.type() == CV_32S && + biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols ); + + FullyConnected p; + + p.srcMat = &srcMat; + p.weights = &weights; + p.biasMat = &biasMat; + p.outputMultiplier = &outputMultiplier; + p.activationLUT = &activationLUT; + p.dstMat = &dstMat; + p.nstripes = nstripes; + p.outZp = outZp; + p.activ = !activationLUT.empty() ? activ : 0; + p.useAVX2 = checkHardwareSupport(CPU_AVX2); + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range& r) const CV_OVERRIDE + { + int valign = FullyConnectedLayerInt8Impl::VEC_ALIGN; + int nsamples = srcMat->rows; + int nw0 = weights->rows; + int k, vecsize = srcMat->cols; + int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN); + size_t total = (size_t)nsamples*nw0; + size_t stripeSize = (total + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total); + size_t wstep = weights->step1(); + AutoBuffer srcbuf(vecsize_aligned + valign); + int8_t* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(int8_t))); + const int* lutptr = !activationLUT->empty() ? 
activationLUT->ptr() : 0; + + for( k = vecsize; k < vecsize_aligned; k++ ) + sptr[k] = 0; + + for( size_t ofs = stripeStart; ofs < stripeEnd; ) + { + int sampleIdx = (int)(ofs / nw0); + int delta = (int)(ofs - (size_t)sampleIdx*nw0); + const int8_t* sptr_ = srcMat->ptr(sampleIdx); + const int8_t* wptr = weights->ptr(delta); + int* dptr = dstMat->ptr(sampleIdx) + delta; + const int* biasptr = biasMat->ptr() + delta; + const float* multptr = outputMultiplier->ptr() + delta; + int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs)); + + memcpy(sptr, sptr_, vecsize*sizeof(sptr[0])); + #if CV_TRY_AVX512_SKX + if( useAVX512 ) + opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else + #endif + #if CV_TRY_AVX2 + if( useAVX2 ) + opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else + #endif + { + int i = 0; + #if CV_SIMD + for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) + { + v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(), + vs2 = v_setzero_s32(), vs3 = v_setzero_s32(); + v_int32x4 outzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127); + v_int32x4 s = v_load(biasptr + i); + v_float32x4 mult = v_load(multptr + i); + + for( k = 0; k < vecsize; k += 16 ) + { + v_int8x16 v = v_load_aligned(sptr + k); + vs0 = v_dotprod_expand_fast(v, v_load_aligned(wptr + k), vs0); + vs1 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep + k), vs1); + vs2 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*2 + k), vs2); + vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3); + } + + s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)); + v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult); + v_store(dptr + i, v_min(v_max(out, outmin), outmax)); + } + #endif + + for( ; i < nw; i++, wptr += wstep ) + { + int s0 = biasptr[i]; + float mult0 = multptr[i]; + + for( k = 0; k < vecsize; k++ ) + { + int8_t v = sptr[k]; + s0 += (int)v*wptr[k]; + } + int out0 = outZp + (int)std::round(s0*mult0); + dptr[i] = std::min(std::max(out0, -128), 127); + } + } + + if(activ) + activ->forwardSlice(dptr, lutptr, dptr, 1, 1, delta, delta + nw); + + ofs += nw; + } + } + + const Mat *srcMat, *weights, *biasMat, *outputMultiplier, *activationLUT; + const ActivationLayerInt8* activ; + Mat* dstMat; + int nstripes, outZp; + bool useAVX2; + bool useAVX512; + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector input, output; + inputs_arr.getMatVector(input); + outputs_arr.getMatVector(output); + + int axisCan = normalize_axis(axis, input[0].dims); + int outerSize = input[0].total(0, axisCan); + Mat srcMat = input[0].reshape(1, outerSize); + + Mat dstMat = output[0].reshape(1, outerSize); + Mat dstMatInt32= Mat(shape(dstMat), CV_32S); + + const int nstripes = getNumThreads(); + FullyConnected::run(srcMat, weightsMat, biasMat, outputMultiplier, activationLUT, dstMatInt32, activ.get(), nstripes, output_zp); + dstMatInt32.convertTo(dstMat, CV_8S); + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(inputs); // suppress unused variable warning + long flops = 0; + + int innerSize = blobs[0].size[1]; + for(int i = 0; i < outputs.size(); i++) + { + flops += CV_BIG_INT(3)*innerSize*total(outputs[i]); + } + + return flops; + + } + + Mat 
weightsMat, biasMat, outputMultiplier, activationLUT;
+    Ptr<ActivationLayerInt8> activ;
+};
+
+Ptr<InnerProductLayerInt8> InnerProductLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<InnerProductLayerInt8>(new FullyConnectedLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/layers_common.hpp b/modules/dnn/src/int8layers/layers_common.hpp
new file mode 100644
index 0000000000..cb185a9eda
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_common.hpp
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
+#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
+#include <opencv2/dnn.hpp>
+#include <opencv2/dnn/shape_utils.hpp>
+
+#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+// dispatched AVX/AVX2 optimizations
+#include "./layers_common.simd.hpp"
+#include "int8layers/layers_common.simd_declarations.hpp"
+#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#ifdef HAVE_OPENCL
+#include "../ocl4dnn/include/ocl4dnn.hpp"
+#endif
+
+namespace cv
+{
+namespace dnn
+{
+void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
+                                std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations,
+                                cv::String &padMode, std::vector<size_t>& adjust_pads);
+
+void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<bool>& globalPooling,
+                            std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);
+
+void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& stride, const String &padMode,
+                          const std::vector<size_t>& dilation, std::vector<int>& out);
+
+ void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& strides, const String &padMode,
+                          std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
+}
+}
+
+#endif
diff --git a/modules/dnn/src/int8layers/layers_common.simd.hpp b/modules/dnn/src/int8layers/layers_common.simd.hpp
new file mode 100644
index 0000000000..bf6149e5c9
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_common.simd.hpp
@@ -0,0 +1,637 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
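Every int8 kernel in this patch finishes with the same requantization step: the int32 accumulator is scaled by a per-output-channel float multiplier, shifted by the output zero-point, and saturated to the int8 range. A minimal sketch of that step (the helper name requantize is illustrative and not part of the patch):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative sketch only: mirrors the scalar tail loops in these kernels, e.g.
    // outptr[j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
    static inline int8_t requantize(int acc, float multiplier, int outZp)
    {
        int out = outZp + (int)std::round(acc * multiplier);   // scale and shift to the output zero-point
        return (int8_t)std::min(std::max(out, -128), 127);     // saturate to the int8 range
    }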
+ +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ); +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp ); +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX2 +#define OPENCV_FMADD_EPI8(_Tpvec, func) \ + inline _Tpvec _##func##_fmaddepi8_epi32(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { \ + _Tpvec even_a = _##func##_srai_epi16(_##func##_bslli_epi128(a, 1), 8); \ + _Tpvec odd_a = _##func##_srai_epi16(a, 8); \ + \ + _Tpvec even_b = _##func##_srai_epi16(_##func##_bslli_epi128(b, 1), 8); \ + _Tpvec odd_b = _##func##_srai_epi16(b, 8); \ + \ + _Tpvec prod0 = _##func##_madd_epi16(even_a, even_b); \ + _Tpvec prod1 = _##func##_madd_epi16(odd_a, odd_b); \ + return _##func##_add_epi32(_##func##_add_epi32(prod0, prod1), c); \ + } +OPENCV_FMADD_EPI8(__m256i, mm256) +//OPENCV_FMADD_EPI8(__m512i, mm512) + +enum { FASCONV_BASE_VECSZ = 4 }; + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; + int rsz = blockSize % FASCONV_BASE_VECSZ; + for( int i = 0; i < rsz; i++ ) + maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; + __m128 mask = _mm_loadu_ps((const float*)maskbuf); + + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + const int8_t* wptr0 = weights + i*wstep; + const int8_t* wptr1 = wptr0 + wstep; + const int8_t* wptr2 = wptr1 + wstep; + int* outptr0 = output + i*outPlaneSize; + int* outptr1 = outptr0 + outPlaneSize; + int* outptr2 = outptr1 + outPlaneSize; + int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + mult2 = mult1; + + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + mult2 = mult1 = mult0; + } + } + int j = 0; + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) + { + bool tail = false; + if (j + FASCONV_BASE_VECSZ > blockSize) + { + if (j == 0) + break; + j = blockSize - FASCONV_BASE_VECSZ; + tail = true; + } + int k = 0; + const int8_t* rptr = rowbuf + j*vecsize_aligned; + + __m256i vs00 = _mm256_setzero_si256(), vs01 = _mm256_setzero_si256(), + vs02 = _mm256_setzero_si256(), vs03 = _mm256_setzero_si256(), + vs10 = _mm256_setzero_si256(), vs11 = _mm256_setzero_si256(), + vs12 = _mm256_setzero_si256(), vs13 = _mm256_setzero_si256(), + vs20 = _mm256_setzero_si256(), vs21 = 
_mm256_setzero_si256(), + vs22 = _mm256_setzero_si256(), vs23 = _mm256_setzero_si256(); + + /* TODO : Fix AVX-512 path. Segmentation fault in Conv2D Tests. +#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling + if (vecsize >= 64) + { + __m512i vs00_5 = _mm512_setzero_si512(), vs01_5 = _mm512_setzero_si512(), + vs02_5 = _mm512_setzero_si512(), vs03_5 = _mm512_setzero_si512(), + vs10_5 = _mm512_setzero_si512(), vs11_5 = _mm512_setzero_si512(), + vs12_5 = _mm512_setzero_si512(), vs13_5 = _mm512_setzero_si512(), + vs20_5 = _mm512_setzero_si512(), vs21_5 = _mm512_setzero_si512(), + vs22_5 = _mm512_setzero_si512(), vs23_5 = _mm512_setzero_si512(); + + for (; k <= vecsize - 64; k += 64, rptr += 64) + { + __m512i w0 = _mm512_load_si512(wptr0 + k); + __m512i w1 = _mm512_load_si512(wptr1 + k); + __m512i w2 = _mm512_load_si512(wptr2 + k); + __m512i r0 = _mm512_load_si512(rptr); + + vs00_5 = _mm512_fmaddepi8_epi32(w0, r0, vs00_5); + vs10_5 = _mm512_fmaddepi8_epi32(w1, r0, vs10_5); + vs20_5 = _mm512_fmaddepi8_epi32(w2, r0, vs20_5); + + r0 = _mm512_load_si512(rptr + vecsize_aligned); + vs01_5 = _mm512_fmaddepi8_epi32(w0, r0, vs01_5); + vs11_5 = _mm512_fmaddepi8_epi32(w1, r0, vs11_5); + vs21_5 = _mm512_fmaddepi8_epi32(w2, r0, vs21_5); + + r0 = _mm512_load_si512(rptr + vecsize_aligned*2); + vs02_5 = _mm512_fmaddepi8_epi32(w0, r0, vs02_5); + vs12_5 = _mm512_fmaddepi8_epi32(w1, r0, vs12_5); + vs22_5 = _mm512_fmaddepi8_epi32(w2, r0, vs22_5); + + r0 = _mm512_load_si512(rptr + vecsize_aligned*3); + vs03_5 = _mm512_fmaddepi8_epi32(w0, r0, vs03_5); + vs13_5 = _mm512_fmaddepi8_epi32(w1, r0, vs13_5); + vs23_5 = _mm512_fmaddepi8_epi32(w2, r0, vs23_5); + } + + // now fold the 512 bit accumulator vectors into 256 bit vectors so that the AVX2 code can finish + // the tail of the vector + + vs00 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs00_5, 0), _mm512_extracti32x8_epi32(vs00_5, 1)); + vs10 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs10_5, 0), _mm512_extracti32x8_epi32(vs10_5, 1)); + vs20 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs20_5, 0), _mm512_extracti32x8_epi32(vs20_5, 1)); + + vs01 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs01_5, 0), _mm512_extracti32x8_epi32(vs01_5, 1)); + vs11 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs11_5, 0), _mm512_extracti32x8_epi32(vs11_5, 1)); + vs21 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs21_5, 0), _mm512_extracti32x8_epi32(vs21_5, 1)); + + vs02 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs02_5, 0), _mm512_extracti32x8_epi32(vs02_5, 1)); + vs12 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs12_5, 0), _mm512_extracti32x8_epi32(vs12_5, 1)); + vs22 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs22_5, 0), _mm512_extracti32x8_epi32(vs22_5, 1)); + + vs03 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs03_5, 0), _mm512_extracti32x8_epi32(vs03_5, 1)); + vs13 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs13_5, 0), _mm512_extracti32x8_epi32(vs13_5, 1)); + vs23 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs23_5, 0), _mm512_extracti32x8_epi32(vs23_5, 1)); + } +#endif + */ + for (; k < vecsize; k += 32, rptr += 32 ) + { + __m256i w0 = _mm256_load_si256((const __m256i*)(wptr0 + k)); + __m256i w1 = _mm256_load_si256((const __m256i*)(wptr1 + k)); + __m256i w2 = _mm256_load_si256((const __m256i*)(wptr2 + k)); + __m256i r0 = _mm256_load_si256((const __m256i*)rptr); + + vs00 = _mm256_fmaddepi8_epi32(w0, r0, vs00); + vs10 = _mm256_fmaddepi8_epi32(w1, r0, vs10); + vs20 = _mm256_fmaddepi8_epi32(w2, r0, vs20); + + r0 = _mm256_load_si256((const 
__m256i*)(rptr + vecsize_aligned)); + vs01 = _mm256_fmaddepi8_epi32(w0, r0, vs01); + vs11 = _mm256_fmaddepi8_epi32(w1, r0, vs11); + vs21 = _mm256_fmaddepi8_epi32(w2, r0, vs21); + + r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*2)); + vs02 = _mm256_fmaddepi8_epi32(w0, r0, vs02); + vs12 = _mm256_fmaddepi8_epi32(w1, r0, vs12); + vs22 = _mm256_fmaddepi8_epi32(w2, r0, vs22); + + r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*3)); + vs03 = _mm256_fmaddepi8_epi32(w0, r0, vs03); + vs13 = _mm256_fmaddepi8_epi32(w1, r0, vs13); + vs23 = _mm256_fmaddepi8_epi32(w2, r0, vs23); + } + + __m256i t0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs00, vs01), _mm256_hadd_epi32(vs02, vs03)); + __m256i t1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs10, vs11), _mm256_hadd_epi32(vs12, vs13)); + __m256i t2 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs20, vs21), _mm256_hadd_epi32(vs22, vs23)); + + t0 = _mm256_add_epi32(t0, _mm256_permute2x128_si256(t0, t0, 1)); + t1 = _mm256_add_epi32(t1, _mm256_permute2x128_si256(t1, t1, 1)); + t2 = _mm256_add_epi32(t2, _mm256_permute2x128_si256(t2, t2, 1)); + + __m128i s0, s1, s2; + + if( initOutput ) + { + s0 = _mm_set1_epi32(bias0); + s1 = _mm_set1_epi32(bias1); + s2 = _mm_set1_epi32(bias2); + } + else + { + s0 = _mm_loadu_si128((__m128i*)(outptr0 + j)); + s1 = _mm_loadu_si128((__m128i*)(outptr1 + j)); + s2 = _mm_loadu_si128((__m128i*)(outptr2 + j)); + } + + s0 = _mm_add_epi32(s0, _mm256_castsi256_si128(t0)); + s1 = _mm_add_epi32(s1, _mm256_castsi256_si128(t1)); + s2 = _mm_add_epi32(s2, _mm256_castsi256_si128(t2)); + + if( finalOutput ) + { + __m128i voutzp = _mm_set1_epi32(outZp); + __m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127); + s0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s0), _mm_set1_ps(mult0)))); + s1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s1), _mm_set1_ps(mult1)))); + s2 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s2), _mm_set1_ps(mult2)))); + + s0 = _mm_min_epi32(_mm_max_epi32(s0, outmin), outmax); + s1 = _mm_min_epi32(_mm_max_epi32(s1, outmin), outmax); + s2 = _mm_min_epi32(_mm_max_epi32(s2, outmin), outmax); + } + if( tail ) + { + s0 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr0 + j), _mm_castsi128_ps(s0), mask)); + s1 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr1 + j), _mm_castsi128_ps(s1), mask)); + s2 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr2 + j), _mm_castsi128_ps(s2), mask)); + } + _mm_storeu_si128((__m128i*)(outptr0 + j), s0); + _mm_storeu_si128((__m128i*)(outptr1 + j), s1); + _mm_storeu_si128((__m128i*)(outptr2 + j), s2); + } + + for( ; j <= blockSize - 2; j += 2 ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned; + int s00, s01, s10, s11, s20, s21; + + if( initOutput ) + { + s00 = s01 = bias0; + s10 = s11 = bias1; + s20 = s21 = bias2; + } + else + { + s00 = outptr0[j]; s01 = outptr0[j+1]; + s10 = outptr1[j]; s11 = outptr1[j+1]; + s20 = outptr2[j]; s21 = outptr2[j+1]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + r = rptr1[k]; + s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127); + s10 = 
std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127); + } + outptr0[j] = s00; + outptr0[j+1] = s01; + outptr1[j] = s10; + outptr1[j+1] = s11; + outptr2[j] = s20; + outptr2[j+1] = s21; + } + + for( ; j < blockSize; j++ ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + int s00, s10, s20; + + if( initOutput ) + { + s00 = bias0; + s10 = bias1; + s20 = bias2; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + s20 = outptr2[j]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + } + outptr0[j] = s00; + outptr1[j] = s10; + outptr2[j] = s20; + } + } + _mm256_zeroupper(); +} + +static inline void _mm256_expand_mul_add(const __m256i& a, const __m256i& b, + __m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3) +{ + __m256i a0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a)); + __m256i a1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a, 1)); + + __m256i b0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(b)); + __m256i b1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b, 1)); + + __m256i a0b0 = _mm256_mullo_epi16(a0, b0); + __m256i a1b1 = _mm256_mullo_epi16(a1, b1); + + out0 = _mm256_add_epi32(out0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a0b0))); + out1 = _mm256_add_epi32(out1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a0b0, 1))); + out2 = _mm256_add_epi32(out2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a1b1))); + out3 = _mm256_add_epi32(out3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a1b1, 1))); +} + +static inline void _mm256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b) +{ + __m256i t0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i t1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + + const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m256i p0 = _mm256_shuffle_epi8(t0, sh); + __m256i p1 = _mm256_shuffle_epi8(t1, sh); + __m256i lo = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i hi = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + a = _mm256_unpacklo_epi64(lo, hi); + b = _mm256_unpackhi_epi64(lo, hi); +} + +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp) +{ + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float mult = multptr[out_d]; + int bias = biasptr[out_d]; + int biasCopy; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t* imgptr0 = inptr_ + in_i*width; + const int8_t* imgptr1 = imgptr0 
+ dilation_h*width; + const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out; + biasCopy = bias; + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + + (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + + (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + + biasCopy + inpZp*(w00 + w10 + w20); + outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 32; + __m256i vw00 = _mm256_set1_epi8(w00), vw01 = _mm256_set1_epi8(w01), vw02 = _mm256_set1_epi8(w02), + vw10 = _mm256_set1_epi8(w10), vw11 = _mm256_set1_epi8(w11), vw12 = _mm256_set1_epi8(w12), + vw20 = _mm256_set1_epi8(w20), vw21 = _mm256_set1_epi8(w21), vw22 = _mm256_set1_epi8(w22); + __m256i vbias = _mm256_set1_epi32(biasCopy), voutzp = _mm256_set1_epi32(outZp), + outmin = _mm256_set1_epi32(-128), outmax = _mm256_set1_epi32(127); + __m256 vmult = _mm256_set1_ps(mult); + __m256i vout0, vout1, vout2, vout3; + + if( stride_w == 1 ) + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j)), + v01 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w)), + v02 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w*2)), + v10 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j)), + v11 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w)), + v12 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w*2)), + v20 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j)), + v21 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w)), + v22 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w*2)); + + vout0 = vout1 = vout2 = vout3 = vbias; + _mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult))); + vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult))); + vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult))); + vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult))); + + vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax); + vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax); + vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax); + vout3 = 
_mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax); + + _mm256_storeu_si256((__m256i*)(outptr + out_j), vout0); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3); + } + } + else + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _mm256_load_deinterleave(imgptr0 + in_j, v00, v01); + _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _mm256_load_deinterleave(imgptr1 + in_j, v10, v11); + _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _mm256_load_deinterleave(imgptr2 + in_j, v20, v21); + _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + vout0 = vout1 = vout2 = vout3 = vbias; + _mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult))); + vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult))); + vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult))); + vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult))); + + vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax); + vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax); + vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax); + vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax); + + _mm256_storeu_si256((__m256i*)(outptr + out_j), vout0); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3); + } + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + + (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + + (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + int s0 = 1, s1 = 1, s2 = 1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp*(w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp*(w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp*(w02 + w12 + w22); + } + out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + + (int)imgptr1[in_j0]*w10*s0 
+ (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + + (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + } + _mm256_zeroupper(); +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ) +{ + int i = 0; + + for( ; i <= nvecs - 8; i += 8 ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = _mm256_setzero_si256(), vs1 = _mm256_setzero_si256(), + vs2 = _mm256_setzero_si256(), vs3 = _mm256_setzero_si256(), + vs4 = _mm256_setzero_si256(), vs5 = _mm256_setzero_si256(), + vs6 = _mm256_setzero_si256(), vs7 = _mm256_setzero_si256(); + + __m128i voutzp = _mm_set1_epi32(outZp); + __m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = _mm256_load_si256((const __m256i*)(vec + k)); + + vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0); + vs1 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep)), v, vs1); + vs2 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*2)), v, vs2); + vs3 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*3)), v, vs3); + vs4 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*4)), v, vs4); + vs5 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*5)), v, vs5); + vs6 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*6)), v, vs6); + vs7 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*7)), v, vs7); + } + + __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs1), _mm256_hadd_epi32(vs2, vs3)); + __m256i s1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs4, vs5), _mm256_hadd_epi32(vs6, vs7)); + + s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1)); + s1 = _mm256_add_epi32(s1, _mm256_permute2x128_si256(s1, s1, 1)); + + __m128i t0 = _mm_add_epi32(_mm256_castsi256_si128(s0), _mm_loadu_si128((__m128i*)(bias + i))); + __m128i t1 = _mm_add_epi32(_mm256_castsi256_si128(s1), _mm_loadu_si128((__m128i*)(bias + i + 4))); + + t0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t0), _mm_loadu_ps(multiplier + i)))); + t1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t1), _mm_loadu_ps(multiplier + i + 4)))); + + t0 = _mm_min_epi32(_mm_max_epi32(t0, outmin), outmax); + t1 = _mm_min_epi32(_mm_max_epi32(t1, outmin), outmax); + + _mm_storeu_si128((__m128i*)(dst + i), t0); + _mm_storeu_si128((__m128i*)(dst + i + 4), t1); + } + + for( ; i < nvecs; i++ ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = _mm256_setzero_si256(); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = _mm256_load_si256((const __m256i*)(vec + k)); + vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0); + } + + __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs0), vs0); + s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1)); + int temp = _mm_extract_epi32(_mm256_castsi256_si128(s0), 0); + dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]); + } + + _mm256_zeroupper(); +} +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/dnn/src/int8layers/pooling_layer.cpp 
b/modules/dnn/src/int8layers/pooling_layer.cpp new file mode 100644 index 0000000000..20a0486a46 --- /dev/null +++ b/modules/dnn/src/int8layers/pooling_layer.cpp @@ -0,0 +1,595 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" +#include "opencv2/core/hal/intrin.hpp" + +#include +#include +#include +using std::max; +using std::min; + +namespace cv +{ +namespace dnn +{ + +class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8 +{ +public: + PoolingLayerInt8Impl(const LayerParams& params) + { + computeMaxIdx = false; + globalPooling = false; + isGlobalPooling = std::vector(3, false); + output_zp = params.get("zeropoints"); + input_zp = params.get("input_zeropoint", 0); + multiplier = params.get("multiplier", 1.f); + + hasDynamicShapes = params.get("has_dynamic_shapes", false); + shapesInitialized = !hasDynamicShapes; + + if (params.has("pool") || params.has("kernel_size") || + params.has("kernel_w") || params.has("kernel_h")) + { + String pool = toLowerCase(params.get("pool", "max")); + if (pool == "max") + type = MAX; + else if (pool == "ave") + type = AVE; + else if (pool == "sum") + type = SUM; + else + CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\""); + + getPoolingKernelParams(params, kernel_size, isGlobalPooling, pads_begin, pads_end, strides, padMode); + globalPooling = isGlobalPooling[0] || isGlobalPooling[1] || isGlobalPooling[2]; + } + else + CV_Error(Error::StsBadArg, "Cannot determine pooling type"); + setParamsFrom(params); + ceilMode = params.get("ceil_mode", true); + spatialScale = params.get("spatial_scale", 1); + avePoolPaddedArea = params.get("ave_pool_padded_area", true); + } + + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + CV_Assert(!inputs.empty()); + CV_Assert(outputs.size() == 1); + + std::vector inp; + std::vector out; + for (int i = 2; i < inputs[0].dims; i++) { + inp.push_back(inputs[0].size[i]); + out.push_back(outputs[0].size[i]); + } + if (globalPooling) { + std::vector finalKernel; + for (int i = 0; i < inp.size(); i++) { + int idx = isGlobalPooling.size() - inp.size() + i; + finalKernel.push_back(isGlobalPooling[idx] ? 
inp[i] : kernel_size[idx]); + } + kernel_size = finalKernel; + } + + getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end); + + if (inputs[0].dims == 3) + { + // Pool1D + kernel_size.assign(1, kernel_size[0]); + strides.assign(1, strides[0]); + pads_begin.assign(1, pads_begin[0]); + pads_end.assign(1, pads_end[0]); + } + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + if (backendId == DNN_BACKEND_OPENCV) + { + if (kernel_size.size() == 3) + return preferableTarget == DNN_TARGET_CPU; + if (kernel_size.size() <= 2) + return true; + else + return false; + } + return false; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + return activ_int8->blobs.empty(); + } + return false; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + switch (type) + { + case MAX: + { + CV_Assert_N(inputs.size() == 1, outputs.size() == 1); + maxPooling(inputs[0], outputs[0]); + break; + } + case AVE: case SUM: + CV_Assert_N(inputs.size() == 1, outputs.size() == 1); + avePooling(inputs[0], outputs[0]); + break; + default: + CV_Error(Error::StsNotImplemented, "Not implemented"); + break; + } + } + + class PoolingInvoker : public ParallelLoopBody + { + public: + const Mat* src, *rois; + Mat *dst; + int pad_l, pad_t, pad_r, pad_b; + bool avePoolPaddedArea; + int nstripes, inpZp, outZp; + std::vector ofsbuf; + int poolingType; + float multiplier; + float spatialScale; + + std::vector pads_begin, pads_end; + std::vector kernel_size; + std::vector strides; + + PoolingInvoker() : src(0), rois(0), dst(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0), + avePoolPaddedArea(false), nstripes(0), inpZp(0), outZp(0), + poolingType(MAX), multiplier(1), spatialScale(0){} + + static void run(const Mat& src, const Mat& rois, Mat& dst, + std::vector kernel_size, std::vector strides, + std::vector pads_begin, std::vector pads_end, + bool avePoolPaddedArea, int poolingType, float spatialScale, + float multiplier, int inpZp, int outZp, int nstripes) + { + CV_Assert_N( + src.isContinuous(), dst.isContinuous(), + src.type() == CV_8S, src.type() == dst.type(), + src.dims == 3 || src.dims == 4 || src.dims == 5, dst.dims == 3 || dst.dims == 4 || dst.dims == 5, + src.size[0] == dst.size[0], src.size[1] == dst.size[1], rois.empty()); + + PoolingInvoker p; + + bool isPool1D = src.dims == 3; + bool isPool3D = src.dims == 5; + + p.src = &src; + p.rois = &rois; + p.dst = &dst; + + p.kernel_size = kernel_size; + p.strides = strides; + p.pads_begin = pads_begin; + p.pads_end = pads_end; + + p.pad_l = pads_begin.back(); + p.pad_t = isPool1D ? 0 : pads_begin[pads_begin.size() - 2]; + p.pad_r = pads_end.back(); + p.pad_b = isPool1D ? 0 : pads_end[pads_end.size() - 2]; + + p.avePoolPaddedArea = avePoolPaddedArea; + p.nstripes = nstripes; + p.inpZp = inpZp; + p.outZp = outZp; + p.poolingType = poolingType; + p.spatialScale = spatialScale; + p.multiplier = multiplier; + + int height = isPool1D ? 1 : src.size[src.dims - 2]; + int width = src.size[src.dims - 1]; + + int kernel_d = isPool3D ? kernel_size[0] : 1; + int kernel_h = isPool1D ? 
1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + + p.ofsbuf.resize(kernel_d * kernel_h * kernel_w); + for (int i = 0; i < kernel_d; ++i) { + for (int j = 0; j < kernel_h; ++j) { + for (int k = 0; k < kernel_w; ++k) { + p.ofsbuf[i * kernel_h * kernel_w + j * kernel_w + k] = width * height * i + width * j + k; + } + } + } + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range& r) const CV_OVERRIDE + { + int channels = dst->size[1]; + + bool isPool3D = src->dims == 5; + bool isPool2D = src->dims == 4; + bool isPool1D = src->dims == 3; + int depth = isPool3D? dst->size[2] : 1; + int height = isPool1D? 1 : dst->size[dst->dims - 2]; + int width = dst->size[dst->dims - 1]; + + int inp_depth = isPool3D? src->size[2] : 1; + int inp_height = isPool1D? 1 : src->size[src->dims - 2]; + int inp_width = src->size[src->dims - 1]; + + size_t total = dst->total(); + size_t stripeSize = (total + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = std::min(r.end*stripeSize, total); + + int kernel_d = isPool3D? kernel_size[0] : 1; + int kernel_h = isPool1D? 1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + + int stride_d = isPool3D? strides[0] : 0; + int stride_h = isPool1D? 1 :strides[strides.size() - 2]; + int stride_w = strides.back(); + +#if CV_SIMD128 + const int* ofsptr = (const int*)&ofsbuf[0]; + if (poolingType == MAX && !ofsptr) + CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode"); +#endif + + for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; ) + { + size_t ofs = ofs0; + int x0 = (int)(ofs % width); + ofs /= width; + int y0 = (int)(ofs % height); + ofs /= height; + + int d0 = (int)(ofs % depth); + ofs /= depth; + + int c = (int)(ofs % channels); + int n = (int)(ofs / channels); + int ystart, yend; + int dstart = 0, dend = 1; + + const int8_t *srcData = 0; + int pad_d_begin = (pads_begin.size() == 3) ? 
pads_begin[0] : 0; + dstart = d0 * stride_d - pad_d_begin; + dend = min(dstart + kernel_d, (int)(inp_depth + pads_end[0])); + + ystart = y0 * stride_h - pad_t; + yend = min(ystart + kernel_h, inp_height + pad_b); + srcData = src->ptr(n, c); + + int ddelta = dend - dstart; + dstart = max(dstart, 0); + dend = min(dend, inp_depth); + int ydelta = yend - ystart; + ystart = max(ystart, 0); + yend = min(yend, inp_height); + int8_t *dstData = &dst->ptr(n, c, d0)[y0 * width]; + + int delta = std::min((int)(stripeEnd - ofs0), width - x0); + ofs0 += delta; + int x1 = x0 + delta; + + if( poolingType == MAX ) + for( ; x0 < x1; x0++ ) + { + int xstart = x0 * stride_w - pad_l; + int xend = min(xstart + kernel_w, inp_width); + xstart = max(xstart, 0); + if (xstart >= xend || ystart >= yend) + { + dstData[x0] = (int8_t)outZp; + continue; + } +#if CV_SIMD128 + if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width ) + { + v_int8x16 max_val0 = v_setall_s8(-128); + if( yend - ystart == kernel_h ) + { + const int8_t* srcData1 = srcData + ystart*inp_width + xstart; + if( stride_w == 1 ) + for (int k = 0; k < kernel_w*kernel_h; k++) + { + int index = ofsptr[k]; + v_int8x16 v0 = v_load(srcData1 + index); + max_val0 = v_max(max_val0, v0); + } + else if( stride_w == 2 ) + for (int k = 0; k < kernel_w*kernel_h; k++) + { + int index = ofsptr[k]; + v_int8x16 v0, dummy; + v_load_deinterleave(srcData1 + index, v0, dummy); + max_val0 = v_max(max_val0, v0); + } + else + for (int k = 0; k < kernel_w*kernel_h; k++) + { + int index = ofsptr[k]; + v_int8x16 v0(srcData1[index], srcData1[index + stride_w], + srcData1[index + stride_w*2], srcData1[index + stride_w*3], + srcData1[index + stride_w*4], srcData1[index + stride_w*5], + srcData1[index + stride_w*6], srcData1[index + stride_w*7], + srcData1[index + stride_w*8], srcData1[index + stride_w*9], + srcData1[index + stride_w*10], srcData1[index + stride_w*11], + srcData1[index + stride_w*12], srcData1[index + stride_w*13], + srcData1[index + stride_w*14], srcData1[index + stride_w*15]); + max_val0 = v_max(max_val0, v0); + } + } + else + { + for (int y = ystart; y < yend; ++y) + { + for (int x = xstart; x < xend; ++x) + { + const int index = y * inp_width + x; + v_int8x16 v0(srcData[index], srcData[index + stride_w], + srcData[index + stride_w*2], srcData[index + stride_w*3], + srcData[index + stride_w*4], srcData[index + stride_w*5], + srcData[index + stride_w*6], srcData[index + stride_w*7], + srcData[index + stride_w*8], srcData[index + stride_w*9], + srcData[index + stride_w*10], srcData[index + stride_w*11], + srcData[index + stride_w*12], srcData[index + stride_w*13], + srcData[index + stride_w*14], srcData[index + stride_w*15]); + max_val0 = v_max(max_val0, v0); + } + } + } + v_store(dstData + x0, max_val0); + x0 += 15; + } + else +#else + CV_UNUSED(isPool2D); +#endif + if( isPool1D ) + { + const int8_t* first = srcData + xstart; + const int8_t* last = srcData + xend; + const int8_t* max_elem = std::max_element(first, last); + if (max_elem != last) + dstData[x0] = *max_elem; + } + else + { + int8_t max_val = -128; + for (int d = dstart; d < dend; ++d) { + for (int y = ystart; y < yend; ++y) { + for (int x = xstart; x < xend; ++x) { + const int index = d * inp_width * inp_height + y * inp_width + x; + int8_t val = srcData[index]; + max_val = std::max(max_val, val); + } + } + } + dstData[x0] = max_val; + } + } + else if (poolingType == AVE || poolingType == SUM) + { + for( ; x0 < x1; ++x0) + { + int xstart = x0 * stride_w - 
pad_l; + int xend = min(xstart + kernel_w, inp_width + pad_r); + int xdelta = xend - xstart; + xstart = max(xstart, 0); + xend = min(xend, inp_width); + + int real_kernel_area = (dend - dstart) * (yend - ystart) * (xend - xstart); + int padded_kernel_area = xdelta * ydelta * ddelta; + int kernel_area = avePoolPaddedArea ? padded_kernel_area : real_kernel_area; + + int bias = (avePoolPaddedArea ? (padded_kernel_area - real_kernel_area) * inpZp : 0) + - (inpZp * kernel_area); + float inv_kernel_area = poolingType == AVE ? multiplier / kernel_area : multiplier; +#if CV_SIMD128 + if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width ) + { + v_int32x4 sum_val0 = v_setall_s32(bias), sum_val1 = v_setall_s32(bias), + sum_val2 = v_setall_s32(bias), sum_val3 = v_setall_s32(bias), + voutzp = v_setall_s32(outZp); + v_float32x4 ikarea = v_setall_f32(inv_kernel_area); + + for (int y = ystart; y < yend; ++y) + { + for (int x = xstart; x < xend; ++x) + { + const int index = y * inp_width + x; + v_int32x4 v0((int)srcData[index], (int)srcData[index + stride_w], + (int)srcData[index + stride_w*2], (int)srcData[index + stride_w*3]); + v_int32x4 v1((int)srcData[index + stride_w*4], (int)srcData[index + stride_w*5], + (int)srcData[index + stride_w*6], (int)srcData[index + stride_w*7]); + v_int32x4 v2((int)srcData[index + stride_w*8], (int)srcData[index + stride_w*9], + (int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]); + v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13], + (int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]); + sum_val0 += v0; + sum_val1 += v1; + sum_val2 += v2; + sum_val3 += v3; + } + } + + sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp; + sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp; + sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp; + sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp; + + v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3))); + x0 += 15; + } + else +#endif + if( isPool1D ) + { + const int8_t* first = srcData + xstart; + const int8_t* last = srcData + xend; + int sum_val = bias + std::accumulate(first, last, 0); + dstData[x0] = saturate_cast(outZp + std::round(sum_val*inv_kernel_area)); + } + else + { + int sum_val = bias; + for (int d = dstart; d < dend; ++d) { + for (int y = ystart; y < yend; ++y) { + for (int x = xstart; x < xend; ++x) { + const int index = d * inp_width * inp_height + y * inp_width + x; + int8_t val = srcData[index]; + sum_val += (int)val; + } + } + } + dstData[x0] = saturate_cast(outZp + std::round(sum_val*inv_kernel_area)); + } + } + } + } + } + }; + + void maxPooling(Mat &src, Mat &dst) + { + const int nstripes = getNumThreads(); + Mat rois; + PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, + spatialScale, multiplier, input_zp, output_zp, nstripes); + } + + void avePooling(Mat &src, Mat &dst) + { + const int nstripes = getNumThreads(); + Mat rois; + PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, + spatialScale, multiplier, input_zp, output_zp, nstripes); + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() != 0); + + bool isPool1D = inputs[0].size() == 3; + std::vector inpShape(inputs[0].begin() + 2, inputs[0].end()); + std::vector 
outShape(inputs[0].begin(), inputs[0].begin() + 2); + + std::vector local_kernel; + if (globalPooling) { + for (int i = 0; i < inpShape.size(); i++) { + int idx = isGlobalPooling.size() - inpShape.size() + i; + local_kernel.push_back(isGlobalPooling[idx] ? inpShape[i] : kernel_size[idx]); + } + } else { + local_kernel = kernel_size; + } + + if (hasDynamicShapes && !shapesInitialized) + { + //Just copy input shapes for width and height to prevent errors on loading stage + for (int i = 0; i < inpShape.size(); i++) + outShape.push_back(inpShape[i]); + } + else if (padMode.empty()) + { + int addedDims = isPool1D? inpShape.size() : local_kernel.size(); + for (int i = 0; i < addedDims; i++) { + float dst = (float) (inpShape[i] + pads_begin[i] + pads_end[i] - local_kernel[i]) / strides[i]; + outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst))); + } + + // If we have padding, ensure that the last pooling starts strictly + // inside the image (instead of at the padding); otherwise clip the last. + for (int i = 0; i < addedDims; i++) { + if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) { + --outShape[2 + i]; + CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]); + } + } + } + else { + getConvPoolOutParams(inpShape, local_kernel, strides, padMode, + std::vector(local_kernel.size(), 1), outShape); + } + + outputs.assign(1, outShape); + return false; + } + + bool updateMemoryShapes(const std::vector &inputs) CV_OVERRIDE + { + int dims = inputs[0].size(); + CV_Assert(inputs[0][dims - 1] > 0 && inputs[0][dims - 2] > 0); + shapesInitialized = true; + return true; + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(inputs); // suppress unused variable warning + long flops = 0; + bool isPool1D = inputs[0].size() == 3; + size_t karea = std::accumulate(kernel_size.begin(), isPool1D? kernel_size.begin() + 1 : kernel_size.end(), + 1, std::multiplies()); + for(int i = 0; i < outputs.size(); i++) + { + if (type == MAX) + { + if (i%2 == 0) + flops += total(outputs[i])*karea; + } + else + { + flops += total(outputs[i])*(karea + 1); + } + } + return flops; + } +private: + enum Type + { + MAX, + AVE, + STOCHASTIC, + SUM, + ROI, // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf + PSROI // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf + }; + bool hasDynamicShapes; + bool shapesInitialized; + float multiplier; +}; + +Ptr PoolingLayerInt8::create(const LayerParams& params) +{ + return Ptr(new PoolingLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp b/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp new file mode 100644 index 0000000000..2ddb76a0e8 --- /dev/null +++ b/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp @@ -0,0 +1,157 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
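// Reference sketch (not part of the patch, helper names are illustrative): the per-tensor
// affine mapping that QuantizeLayer / DequantizeLayer below implement with Mat::convertTo,
// where `scale` and `zeropoint` come from the "scales" / "zeropoints" layer params.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t quantizeValue(float x, float scale, int zeropoint)
{
    // q = round(x / scale) + zeropoint, saturated to the int8 range;
    // convertTo expresses this as alpha = 1/scale, beta = zeropoint.
    int q = (int)std::round(x / scale) + zeropoint;
    return (int8_t)std::min(std::max(q, -128), 127);
}

static inline float dequantizeValue(int8_t q, float scale, int zeropoint)
{
    // x = scale * (q - zeropoint);
    // convertTo expresses this as alpha = scale, beta = -scale*zeropoint.
    return scale * (float)((int)q - zeropoint);
}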
+ +#include "../precomp.hpp" +#include "layers_common.hpp" + +namespace cv +{ +namespace dnn +{ + +class QuantizeLayerImpl CV_FINAL : public QuantizeLayer +{ +public: + QuantizeLayerImpl(const LayerParams& params) + { + scale = params.get("scales", 1.0f); + zeropoint = params.get("zeropoints", 0); + setParamsFrom(params); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() == 1); + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return false; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + } + +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + std::vector inputs, outputs; + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + if (inputs_.depth() == CV_16S) + { + UMat inputFp32(shape(inputs[0]), CV_32F); + convertFp16(inputs[0], inputFp32); + inputFp32.copyTo(inputs[0]); + } + + inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint); + return true; + } +#endif + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint); + } +}; + +class DequantizeLayerImpl CV_FINAL : public DequantizeLayer +{ +public: + DequantizeLayerImpl(const LayerParams& params) + { + scale = params.get("scales", 1.0f); + zeropoint = params.get("zeropoints", 0); + setParamsFrom(params); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() == 1); + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return false; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + } + +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + std::vector inputs, outputs; + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + UMat outputFp32(shape(outputs[0]), CV_32F); + inputs[0].convertTo(outputFp32, CV_32F, scale, -(scale*zeropoint)); + + if (outputs_.depth() == CV_16S) + convertFp16(outputFp32, outputs[0]); + else + outputFp32.copyTo(outputs[0]); + return true; + } +#endif + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + + 
std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + inputs[0].convertTo(outputs[0], CV_32F, scale, -(scale*zeropoint)); + } +}; + +Ptr QuantizeLayer::create(const LayerParams& params) +{ + return Ptr(new QuantizeLayerImpl(params)); +} + +Ptr DequantizeLayer::create(const LayerParams& params) +{ + return Ptr(new DequantizeLayerImpl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/scale_layer.cpp b/modules/dnn/src/int8layers/scale_layer.cpp new file mode 100644 index 0000000000..d7f676d047 --- /dev/null +++ b/modules/dnn/src/int8layers/scale_layer.cpp @@ -0,0 +1,211 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" +#include +#include + +namespace cv +{ +namespace dnn +{ + +class ScaleLayerInt8Impl CV_FINAL : public ScaleLayerInt8 +{ +public: + Mat weights, bias; + ScaleLayerInt8Impl(const LayerParams& params) + { + setParamsFrom(params); + hasBias = params.get("bias_term", false); + axis = params.get("axis", 1); + hasWeights = false; + + output_sc = params.get("scales"); + output_zp = params.get("zeropoints"); + + DictValue inpSc = params.get("input_scales"); + DictValue inpZp = params.get("input_zeropoints"); + + for (int i = 0; i < inpSc.size(); i++) + { + inp_sc.push_back(inpSc.get(i)); + inp_zp.push_back(inpZp.get(i)); + } + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + outputs.assign(1, inputs[0]); + return true; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE + { + std::vector inputs; + inputs_arr.getMatVector(inputs); + hasWeights = blobs.size() == 2 || (blobs.size() <= 1 && !hasBias); + CV_Assert((inputs.size() == 2 && blobs.empty()) || blobs.size() == (int)hasWeights + (int)hasBias); + + if (!blobs.empty()) + { + Mat w = hasWeights ? blobs[0] : Mat::ones(blobs[0].size(), CV_32F); + Mat b = hasBias ? blobs.back() : Mat::zeros(blobs.back().size(), CV_32F); + + w = w.reshape(1, 1); + b = b.reshape(1, 1); + + w.convertTo(weights, CV_32F, inp_sc[0]/output_sc); + addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F); + } + else + { + // initialized during forward() + weights = Mat(); bias = Mat(); + } + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + return activ_int8->blobs.empty(); + } + return false; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + Mat &inpBlob = inputs[0]; + Mat &outBlob = outputs[0]; + + if (blobs.empty()) + { + CV_Assert(inp_sc.size() == 2 && inp_zp.size() == 2); + Mat inp_dequantized, w, b; + inputs[1].reshape(1, 1).convertTo(inp_dequantized, CV_32F, inp_sc[1], -(inp_sc[1]*inp_zp[1])); + w = hasWeights ? inp_dequantized : Mat::ones(inp_dequantized.size(), CV_32F); + b = hasBias ? 
inp_dequantized : Mat::zeros(inp_dequantized.size(), CV_32F); + + w.convertTo(weights, CV_32F, inp_sc[0]/output_sc); + addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F); + } + + MatShape inpShape = shape(inpBlob); + const int numWeights = weights.total(); + CV_Assert(numWeights != 0); + CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs"); + + int endAxis; + for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis) + { + if (total(inpShape, axis, endAxis) == numWeights) + break; + } + CV_Assert(total(inpShape, axis, endAxis) == numWeights); + CV_CheckTypeEQ(inpBlob.type(), CV_8SC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_8SC1, ""); + + int numSlices = total(inpShape, 0, axis); + int8_t* inpData = (int8_t*)inpBlob.data; + int8_t* outData = (int8_t*)outBlob.data; + + if (endAxis != inpBlob.dims) + { + float* weightsData = (float*)weights.data; + float* biasesData = (float*)bias.data; + int spatialSize = total(inpShape, endAxis); // spatialSize != 1 + for (int i = 0; i < numSlices; ++i) + { + for (int j = 0; j < numWeights; ++j) + { + float w = weightsData[j]; + float b = biasesData[j]; + Mat inpSlice(1, spatialSize, CV_8S, inpData); + Mat outSlice(1, spatialSize, CV_8S, outData); + inpSlice.convertTo(outSlice, CV_8S, w, b); + inpData += spatialSize; + outData += spatialSize; + } + } + } + else + { + for (int i = 0; i < numSlices; ++i) + { + Mat inpSlice(1, numWeights, CV_8S, inpData); + Mat outSlice(1, numWeights, CV_8S, outData); + + multiply(inpSlice, weights, outSlice, 1.0, CV_8S); + add(outSlice, bias, outSlice, Mat(), CV_8S); + + inpData += numWeights; + outData += numWeights; + } + } + } + + void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE + { + scale = (hasWeights && !blobs.empty()) ? blobs[0] : Mat(); + shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat(); + } + + void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE + { + scale = output_sc; + zeropoint = output_zp; + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + long flops = 0; + for(int i = 0; i < inputs.size(); i++) + { + flops += 2*total(inputs[i]); + } + return flops; + } + +private: + bool hasWeights; + std::vector inp_sc; + std::vector inp_zp; +}; + + +Ptr ScaleLayerInt8::create(const LayerParams& params) +{ + return Ptr(new ScaleLayerInt8Impl(params)); +} + +Ptr ShiftLayerInt8::create(const LayerParams& params) +{ + LayerParams scaleParams = params; + scaleParams.type = "ScaleInt8"; + scaleParams.set("bias_term", true); + scaleParams.set("axis", 0); + return Ptr(new ScaleLayerInt8Impl(scaleParams)); +} + +} // namespace dnn +} // namespace cv diff --git a/modules/dnn/src/int8layers/softmax_layer.cpp b/modules/dnn/src/int8layers/softmax_layer.cpp new file mode 100644 index 0000000000..7e3c82bc21 --- /dev/null +++ b/modules/dnn/src/int8layers/softmax_layer.cpp @@ -0,0 +1,176 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
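// Reference sketch (not part of the patch, names are illustrative): the per-position
// computation performed by the int8 softmax forward pass below. `expLUT` stands for the
// 256-entry table of exponentials stored in blobs[0], indexed by (int8 value + 128);
// `cnStep` is the stride between consecutive channels along the softmax axis.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static inline int8_t softmaxAt(const int8_t* src, size_t channel, size_t channels, size_t cnStep,
                               const float* expLUT, float outputScale, int outputZp)
{
    float expSum = 0.f;
    for (size_t c = 0; c < channels; c++)
        expSum += expLUT[src[c * cnStep] + 128];              // sum of exp over the axis
    float p = expLUT[src[channel * cnStep] + 128] / expSum;   // softmax probability
    int q = outputZp + (int)std::round(p / outputScale);      // requantize to int8
    return (int8_t)std::min(std::max(q, -128), 127);
}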
+ +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include +#include + +namespace cv +{ +namespace dnn +{ + +class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8 +{ +public: + + SoftMaxLayerInt8Impl(const LayerParams& params) + { + axisRaw = params.get("axis", 1); + logSoftMax = params.get("log_softmax", false); + output_sc = params.get("scales"); + output_zp = params.get("zeropoints"); + setParamsFrom(params); + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + MatShape shape = inputs[0]; + int cAxis = normalize_axis(axisRaw, shape.size()); + shape[cAxis] = 1; + internals.assign(1, shape); + return inplace; + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + Ptr dequantize_layer = top.dynamicCast(); + return !dequantize_layer.empty() && preferableTarget != DNN_TARGET_OPENCL_FP16; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); + + const Mat &src = inputs[0]; + Mat &dst = outputs[0]; + + int axis = normalize_axis(axisRaw, src.dims); + size_t outerSize = src.total(0, axis), channels = src.size[axis], + innerSize = src.total(axis + 1); + + CV_Assert(src.type() == CV_8S && (dst.type() == CV_8S || dst.type() == CV_32F)); + CV_Assert(src.isContinuous() && dst.isContinuous()); + + size_t outerStep = src.total(axis); + size_t cnStep = src.total(axis + 1); + const int8_t *srcPtr = src.ptr(); + const float *expPtr = blobs[0].ptr(); + + if (dst.type() == CV_32F) + { + float *dstPtr = dst.ptr(); + for (size_t outerDim = 0; outerDim < outerSize; outerDim++) + { + size_t srcOffset = outerDim * outerStep; + std::vector expSum(innerSize, 0.f); + + // sum exp along axis + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + expSum[i] += expPtr[srcPtr[offset + i] + 128]; + } + + // divide by computed sum + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + dstPtr[offset + i] = expPtr[srcPtr[offset + i] + 128]/expSum[i]; + } + + if (logSoftMax) + { + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + dstPtr[offset + i] = log(dstPtr[offset + i]); + } + } + } + } + else + { + const float inv_scale = 1.f/output_sc; + int8_t *dstPtr = dst.ptr(); + for (size_t outerDim = 0; outerDim < outerSize; outerDim++) + { + size_t srcOffset = outerDim * outerStep; + std::vector expSum(innerSize, 0.f); + + // sum exp along axis + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + expSum[i] += expPtr[srcPtr[offset + i] + 128]; + } + + // divide by computed sum and quantize to int8 + if (logSoftMax) + { + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i 
= 0; i < innerSize; i++) + dstPtr[offset + i] = saturate_cast(output_zp + std::round(inv_scale*log(expPtr[srcPtr[offset + i] + 128]/expSum[i]))); + } + } + else + { + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + dstPtr[offset + i] = saturate_cast(output_zp + std::round(inv_scale*(expPtr[srcPtr[offset + i] + 128]/expSum[i]))); + } + } + } + } + } + + int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + int64 flops = 0; + + for (int i = 0; i < inputs.size(); i++) + { + flops += 4*total(inputs[i]); + } + + return flops; + } + + int axisRaw; +}; + +Ptr SoftmaxLayerInt8::create(const LayerParams& params) +{ + return Ptr(new SoftMaxLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index e28c964689..49804c5c13 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -409,6 +409,18 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + params.set("input_scale", scales[0][0]); + params.set("input_zeropoint", zeropoints[0][0]); + + params.blobs.clear(); + params.blobs.push_back(origin_weights); + params.blobs.push_back(origin_bias); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 5f93b45886..59548a9c0c 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -166,6 +166,11 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } }; Ptr BlankLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index a950c56167..536114fcd7 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -70,6 +70,7 @@ public: setParamsFrom(params); axis = params.get("axis", 1); padding = params.get("padding", false); + paddingValue = params.get("padding_value", 0); } virtual bool getMemoryShapes(const std::vector &inputs, @@ -119,13 +120,14 @@ public: (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding); } + template class ChannelConcatInvoker : public ParallelLoopBody { public: std::vector* inputs; Mat* output; int nstripes; - std::vector chptrs; + std::vector chptrs; static void run(std::vector& inputs, Mat& output, int nstripes) { @@ -139,14 +141,14 @@ public: for( i = 0; i < ninputs; i++ ) { Mat& inp = inputs[i]; - CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) && + CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) && inp.dims == 4 && inp.size[0] == output.size[0] && inp.size[2] == output.size[2] && inp.size[3] == output.size[3] ); nchannels += inp.size[1]; } CV_Assert( nchannels == output.size[1] ); - CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) ); + CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) ); 
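// Reference sketch (not part of the patch): with ChannelConcatInvoker now parameterized on
// the element type, the forward pass dispatches on the output depth. int8_t and float are
// assumed here as the template arguments behind the dispatch shown further below:
//   if (outMat.type() == CV_8S)
//       ChannelConcatInvoker<int8_t>::run(inputs, outMat, nstripes);
//   else
//       ChannelConcatInvoker<float>::run(inputs, outMat, nstripes);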
cc.chptrs.resize(nchannels*batchsz); @@ -157,7 +159,7 @@ public: for( int j = 0; j < batchsz; j++ ) for( int k = 0; k < inp.size[1]; k++ ) { - const float* ptr = inp.ptr(j, k); + const T* ptr = inp.ptr(j, k); cc.chptrs[ofs + j*nchannels + k] = ptr; } ofs += inp.size[1]; @@ -176,8 +178,8 @@ public: size_t stripeSize = (total + nstripes - 1)/nstripes; size_t stripeStart = r.start*stripeSize; size_t stripeEnd = std::min(total, r.end*stripeSize); - const float** ptrs = (const float**)&chptrs[0]; - float* outptr = output->ptr(); + const T** ptrs = (const T**)&chptrs[0]; + T* outptr = output->ptr(); size_t blockSize0 = 1 << 16; for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; ) @@ -248,7 +250,8 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && + inputs_arr.depth() != CV_8S, forward_ocl(inputs_arr, outputs_arr, internals_arr)) std::vector inputs, outputs; @@ -259,12 +262,15 @@ public: Mat& outMat = outputs[0]; if (padding) - outMat.setTo(0); + outMat.setTo(paddingValue); if( cAxis == 1 && outMat.dims == 4 && !padding) { int nstripes = getNumThreads(); - ChannelConcatInvoker::run(inputs, outMat, nstripes); + if (outMat.type() == CV_8S) + ChannelConcatInvoker::run(inputs, outMat, nstripes); + else + ChannelConcatInvoker::run(inputs, outMat, nstripes); } else { @@ -394,6 +400,14 @@ public: return Ptr(new InfEngineNgraphNode(concat)); } #endif // HAVE_DNN_NGRAPH + + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (padding) + params.set("padding_value", zeropoints[1][0]); + return true; + } }; Ptr ConcatLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp index bbea3e3f2c..18f190b36b 100644 --- a/modules/dnn/src/layers/const_layer.cpp +++ b/modules/dnn/src/layers/const_layer.cpp @@ -112,6 +112,15 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + Mat quantizedBlob; + blobs[0].convertTo(quantizedBlob, CV_8S, 1.f/scales[1][0], zeropoints[1][0]); + params.blobs.clear(); + params.blobs.push_back(quantizedBlob); + return true; + } }; Ptr ConstLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 68c543be24..ec2904ee69 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -2083,6 +2083,48 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + // References - https://arxiv.org/pdf/1712.05877.pdf + + // Quantized convolution with variable weights is not supported. + if (blobs.empty()) + return false; + + float inputScale = scales[0][0], outputScale = scales[1][0]; + int inputZp = zeropoints[0][0]; + params.set("input_zeropoint", inputZp); + + Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S); + Mat biasQuantized(1, numOutput, CV_32S); + Mat outputMultiplier(1, numOutput, CV_32F); + double realMin, realMax, weightsScale; + + for( int i = 0; i < numOutput; i++ ) + { + // Quantize weights + cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax); + realMin = std::min(realMin, 0.0); + realMax = std::max(realMax, 0.0); + weightsScale = (realMax == realMin) ? 
1.0 : std::max(-realMin, realMax)/127; + weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale); + + // Quantize biases + float biasScale = inputScale * weightsScale; + biasQuantized.at(i) = (int)std::round(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]); + + // Store multiplier + outputMultiplier.at(i) = biasScale / outputScale; + } + + params.blobs.clear(); + params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0]))); + params.blobs.push_back(biasQuantized); + params.blobs.push_back(outputMultiplier); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 9bb5be342f..6dc1813c8b 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -255,6 +255,12 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return func.tryQuantize(scales, zeropoints, params); + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { @@ -288,6 +294,8 @@ struct BaseFunctor bool tryFuse(Ptr&) { return false; } void getScaleShift(Mat&, Mat&) const {} + + bool tryQuantize(const std::vector>&, const std::vector>&, LayerParams&) { return false; } }; struct ReLUFunctor : public BaseFunctor @@ -436,6 +444,29 @@ struct ReLUFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + if (slope != 0.f) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x >= 0.f ? 
x : slope*x; + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + } + return true; + } + int64 getFLOPSPerElement() const { return 1; } }; @@ -559,6 +590,12 @@ struct ReLU6Functor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + return true; + } + int64 getFLOPSPerElement() const { return 2; } }; @@ -651,6 +688,26 @@ struct TanHFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = tanh(x); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 1; } }; @@ -743,6 +800,26 @@ struct SwishFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x / (1.0f + exp(-x)); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 3; } }; @@ -848,6 +925,28 @@ struct MishFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float eX = exp(x); + float n = (eX + 2) * eX; + float y = (x * n) / (n + 2); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 3; } }; @@ -940,6 +1039,26 @@ struct SigmoidFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = 1.f/(1.f + exp(-x)); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 3; } }; @@ -1032,6 +1151,26 @@ struct ELUFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool 
tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x >= 0.f ? x : exp(x) - 1; + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 2; } }; @@ -1130,6 +1269,26 @@ struct AbsValFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = abs(x); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 1; } }; @@ -1223,6 +1382,26 @@ struct BNLLFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x > 0 ? x + log(1. + exp(-x)) : log(1. + exp(x)); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 5; } }; diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index a337c48d9e..860560213d 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -864,6 +864,37 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (op == SUM) + { + std::vector newCoeffs; + float offset = zeropoints[1][0]; + float out_sc = scales[1][0]; + for (int i = 0; i < scales[0].size(); i++) + { + float coeff = coeffs.empty() ? 
1.f : coeffs[i]; + float newcoeff = (scales[0][i] * coeff) / out_sc; + newCoeffs.push_back(newcoeff); + offset -= (newcoeff * zeropoints[0][i]); + } + params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size())); + params.set("offset", offset); + return true; + } + else if (op == PROD) + { + std::vector newCoeffs = scales[0]; + newCoeffs[0] /= scales[1][0]; + params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size())); + params.set("offset", zeropoints[1][0]); + params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size())); + return true; + } + return op == MAX; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index 7cf01a14fa..8ff862fab0 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -227,6 +227,11 @@ virtual Ptr initNgraph(const std::vector >& inp } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } int _startAxis; int _endAxis; diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 529f3c04fd..28ea7f347f 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -618,6 +618,45 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (blobs.empty()) + return false; + + int numOutput = blobs[0].size[0]; + float inputScale = scales[0][0], outputScale = scales[1][0]; + int inputZp = zeropoints[0][0]; + + Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S); + Mat biasQuantized(1, numOutput, CV_32S); + Mat outputMultiplier(1, numOutput, CV_32F); + + double realMin, realMax, weightsScale; + for( int i = 0; i < numOutput; i++ ) + { + // Quantize weights + cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax); + realMin = std::min(realMin, 0.0); + realMax = std::max(realMax, 0.0); + weightsScale = (realMax == realMin) ? 
1.0 : std::max(-realMin, realMax)/127; + weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale); + + // Quantize biases + float biasScale = inputScale * weightsScale; + biasQuantized.at(i) = (int)std::round(biasMat.at(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]); + + // Store multiplier + outputMultiplier.at(i) = biasScale / outputScale; + } + + params.blobs.clear(); + params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0]))); + params.blobs.push_back(biasQuantized); + params.blobs.push_back(outputMultiplier); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp index d182568795..c1979ce701 100644 --- a/modules/dnn/src/layers/padding_layer.cpp +++ b/modules/dnn/src/layers/padding_layer.cpp @@ -134,6 +134,8 @@ public: cv::convertFp16(paddingValue_fp32, paddingValue_fp16); outputs[0].setTo(paddingValue_fp16[0]); } + else if (inputs_arr.depth() == CV_8S) + outputs[0].setTo(saturate_cast(paddingValue)); else outputs[0].setTo(paddingValue); inputs[0].copyTo(outputs[0](dstRanges)); @@ -264,6 +266,16 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + float outputScale = scales[1][0]; + int outputZp = zeropoints[1][0]; + float padValue = outputZp + std::round(params.get("value", 0)/outputScale); + params.set("value", padValue); + return true; + } + private: std::vector > paddings; // Pairs pad before, pad after. std::vector dstRanges; diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index c525c3f82f..77c2469c05 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -194,6 +194,7 @@ public: #endif } + template class PermuteInvoker : public ParallelLoopBody { public: @@ -229,7 +230,7 @@ public: size_t stripeStart = r.start*stripeSize; size_t stripeEnd = std::min(r.end*stripeSize, orows); - const size_t esz = sizeof(float); + const size_t esz = sizeof(T); size_t ostep0 = out->step[0]/esz, ostep1 = out->step[1]/esz, ostep2 = out->step[2]/esz; const size_t* ord = &order->at(0); size_t istep0 = inp->step[ord[0]]/esz, istep1 = inp->step[ord[1]]/esz, @@ -241,13 +242,13 @@ public: int i1 = (int)(val % n1); int i0 = (int)(val / n1); - const float* inptr_orig = inp->ptr(); - float* outptr_orig = out->ptr(); + const T* inptr_orig = inp->ptr(); + T* outptr_orig = out->ptr(); for( size_t ofs = stripeStart; ofs < stripeEnd; ofs++ ) { - const float* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2; - float* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2; + const T* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2; + T* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2; for( int i3 = 0; i3 < n3; i3++ ) outptr[i3] = inptr[i3*istep3]; @@ -321,7 +322,8 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && + inputs_arr.depth() != CV_8S, forward_ocl(inputs_arr, outputs_arr, internals_arr)) if (inputs_arr.depth() == CV_16S) @@ -365,24 +367,48 @@ public: if( numAxes == 4 ) { int nstripes = getNumThreads(); - PermuteInvoker::run(inp, out, _order, nstripes); + if (inp.type() == CV_8S) + PermuteInvoker::run(inp, out, _order, nstripes); + else + 
PermuteInvoker::run(inp, out, _order, nstripes); } else { - const float *srcData = inp.ptr(); - float *dstData = out.ptr(); + if (inp.type() == CV_8S) + { + const int8_t *srcData = inp.ptr(); + int8_t *dstData = out.ptr(); - for (i = 0; i < count; ++i) + for (i = 0; i < count; ++i) + { + size_t oldPosition = 0; + size_t newPosition = i; + + for (j = 0; j < numAxes; ++j) + { + oldPosition += (newPosition / newStride[j]) * oldStride[order[j]]; + newPosition %= newStride[j]; + } + dstData[i] = srcData[oldPosition]; + } + } + else { - size_t oldPosition = 0; - size_t newPosition = i; + const float *srcData = inp.ptr(); + float *dstData = out.ptr(); - for (j = 0; j < numAxes; ++j) + for (i = 0; i < count; ++i) { - oldPosition += (newPosition / newStride[j]) * oldStride[order[j]]; - newPosition %= newStride[j]; + size_t oldPosition = 0; + size_t newPosition = i; + + for (j = 0; j < numAxes; ++j) + { + oldPosition += (newPosition / newStride[j]) * oldStride[order[j]]; + newPosition %= newStride[j]; + } + dstData[i] = srcData[oldPosition]; } - dstData[i] = srcData[oldPosition]; } } } @@ -436,6 +462,11 @@ public: } #endif // HAVE_VULKAN + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } size_t _count; std::vector _order; diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index b8e2cfdf8f..7653e53668 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -1327,6 +1327,23 @@ public: return true; } + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (type == MAX && !computeMaxIdx) + { + return true; + } + else if (type == AVE || type == SUM) + { + float multiplier = scales[0][0] / scales[1][0]; + params.set("multiplier", multiplier); + params.set("input_zeropoint", zeropoints[0][0]); + return true; + } + return false; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index da1c61adac..797df4819d 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -231,6 +231,11 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index ab8f41c7b6..4c10d155c8 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -343,6 +343,11 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } private: std::vector outShapes; diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index a5c268214e..001db24a2d 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -344,6 +344,14 @@ public: shift = (hasBias && !blobs.empty()) ? 
blobs.back() : Mat(); } + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size())); + params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size())); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/shuffle_channel_layer.cpp b/modules/dnn/src/layers/shuffle_channel_layer.cpp index 6db74d1abd..2a698d270f 100644 --- a/modules/dnn/src/layers/shuffle_channel_layer.cpp +++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp @@ -147,6 +147,12 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } + private: Ptr permute; std::vector permuteInpShape, permuteOutShape; diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 54e2340387..9efd95cf48 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -531,7 +531,12 @@ public: { std::vector inpIdx(dimsNum, 0); std::vector outIdx(dimsNum, 0); - getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + if (inpMat.type() == CV_16S) + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + else if (inpMat.type() == CV_8S) + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + else + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); } } } @@ -647,8 +652,20 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + const int numOutputs = scales[1].size(); + for (int i = 0; i < numOutputs; i++) + { + if (scales[1][i] != scales[0][0]) + return false; + } + return true; + } private: + template void getSliceRecursive(const Mat &inpMat, std::vector &inpIdx, const std::vector &sliceRanges, const std::vector &sliceSteps, int dim, int dimsNum, @@ -658,8 +675,6 @@ private: int end = sliceRanges[dim].end; int step = !sliceSteps.empty() ? 
sliceSteps[dim] : 1; - const bool is32F = inpMat.depth() == CV_32F; - // TODO optimization is required (for 2D tail case at least) for (int k = begin, j = 0; k < end; k += step, j++) { @@ -667,14 +682,9 @@ private: outIdx[dim] = j; if (dim + 1 < dimsNum) - getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx); + getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx); else - { - if (is32F) - outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); - else - outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); // 16F emulation - } + outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); } } diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 546c1017ad..e937e98f8c 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -374,6 +374,22 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + float inpScale = scales[0][0]; + Mat lookUpTable(1, 256, CV_32F); + float* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - 127); // ensures exp(x) is always between (0, 1) + table[i+128] = std::exp(x); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp index b025d5ff1e..2a44176152 100644 --- a/modules/dnn/src/layers/split_layer.cpp +++ b/modules/dnn/src/layers/split_layer.cpp @@ -117,6 +117,17 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + const int numOutputs = scales[1].size(); + for (int i = 0; i < numOutputs; i++) + { + if (scales[1][i] != scales[0][0]) + return false; + } + return true; + } }; Ptr SplitLayer::create(const LayerParams& params) diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp new file mode 100644 index 0000000000..1fcb1d0dba --- /dev/null +++ b/modules/dnn/test/test_int8_layers.cpp @@ -0,0 +1,1220 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" +#include "npy_blob.hpp" +#include +#include +namespace opencv_test { namespace { + +template +static std::string _tf(TString filename) +{ + return (getOpenCVExtraDir() + "dnn/") + filename; +} + +class Test_Int8_layers : public DNNTestLayer +{ +public: + void testLayer(const String& basename, const String& importer, double l1, double lInf, + int numInps = 1, int numOuts = 1, bool useCaffeModel = false, + bool useCommonInputBlob = true, bool hasText = false) + { + CV_Assert_N(numInps >= 1, numInps <= 10, numOuts >= 1, numOuts <= 10); + std::vector inps(numInps), inps_int8(numInps); + std::vector refs(numOuts), outs_int8(numOuts), outs_dequantized(numOuts); + std::vector inputScale, outputScale; + std::vector inputZp, outputZp; + String inpPath, outPath; + Net net, qnet; + + if (importer == "Caffe") + { + String prototxt = _tf("layers/" + basename + ".prototxt"); + String caffemodel = _tf("layers/" + basename + ".caffemodel"); + net = readNetFromCaffe(prototxt, useCaffeModel ? 
caffemodel : String()); + + inpPath = _tf("layers/" + (useCommonInputBlob ? "blob" : basename + ".input")); + outPath = _tf("layers/" + basename); + } + else if (importer == "TensorFlow") + { + String netPath = _tf("tensorflow/" + basename + "_net.pb"); + String netConfig = hasText ? _tf("tensorflow/" + basename + "_net.pbtxt") : ""; + net = readNetFromTensorflow(netPath, netConfig); + + inpPath = _tf("tensorflow/" + basename + "_in"); + outPath = _tf("tensorflow/" + basename + "_out"); + } + else if (importer == "ONNX") + { + String onnxmodel = _tf("onnx/models/" + basename + ".onnx"); + net = readNetFromONNX(onnxmodel); + + inpPath = _tf("onnx/data/input_" + basename); + outPath = _tf("onnx/data/output_" + basename); + } + ASSERT_FALSE(net.empty()); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + + for (int i = 0; i < numInps; i++) + inps[i] = blobFromNPY(inpPath + ((numInps > 1) ? cv::format("_%d.npy", i) : ".npy")); + + for (int i = 0; i < numOuts; i++) + refs[i] = blobFromNPY(outPath + ((numOuts > 1) ? cv::format("_%d.npy", i) : ".npy")); + + qnet = net.quantize(inps, CV_8S, CV_8S); + qnet.getInputDetails(inputScale, inputZp); + qnet.getOutputDetails(outputScale, outputZp); + + // Quantize inputs to int8 + // int8_value = float_value/scale + zero-point + for (int i = 0; i < numInps; i++) + { + inps[i].convertTo(inps_int8[i], CV_8S, 1.f/inputScale[i], inputZp[i]); + String inp_name = numInps > 1 ? (importer == "Caffe" ? cv::format("input_%d", i) : cv::format("%d", i)) : ""; + qnet.setInput(inps_int8[i], inp_name); + } + qnet.forward(outs_int8); + + // Dequantize outputs and compare with reference outputs + // float_value = scale*(int8_value - zero-point) + for (int i = 0; i < numOuts; i++) + { + outs_int8[i].convertTo(outs_dequantized[i], CV_32F, outputScale[i], -(outputScale[i] * outputZp[i])); + normAssert(refs[i], outs_dequantized[i], "", l1, lInf); + } + } +}; + +TEST_P(Test_Int8_layers, Convolution1D) +{ + testLayer("conv1d", "ONNX", 0.00302, 0.00909); + testLayer("conv1d_bias", "ONNX", 0.00306, 0.00948); +} + +TEST_P(Test_Int8_layers, Convolution2D) +{ + testLayer("layer_convolution", "Caffe", 0.0174, 0.0758, 1, 1, true); + testLayer("single_conv", "TensorFlow", 0.00413, 0.02201); + testLayer("depthwise_conv2d", "TensorFlow", 0.0388, 0.169); + testLayer("atrous_conv2d_valid", "TensorFlow", 0.0193, 0.0633); + testLayer("atrous_conv2d_same", "TensorFlow", 0.0185, 0.1322); + testLayer("keras_atrous_conv2d_same", "TensorFlow", 0.0056, 0.0244); + testLayer("convolution", "ONNX", 0.0052, 0.01516); + testLayer("two_convolution", "ONNX", 0.00295, 0.00840); +} + +TEST_P(Test_Int8_layers, Convolution3D) +{ + testLayer("conv3d", "TensorFlow", 0.00734, 0.02434); + testLayer("conv3d", "ONNX", 0.00353, 0.00941); + testLayer("conv3d_bias", "ONNX", 0.00129, 0.00249); +} + +TEST_P(Test_Int8_layers, Flatten) +{ + testLayer("flatten", "TensorFlow", 0.0036, 0.0069, 1, 1, false, true, true); + testLayer("unfused_flatten", "TensorFlow", 0.0014, 0.0028); + testLayer("unfused_flatten_unknown_batch", "TensorFlow", 0.0043, 0.0051); +} + +TEST_P(Test_Int8_layers, Padding) +{ + testLayer("padding_valid", "TensorFlow", 0.0026, 0.0064); + testLayer("padding_same", "TensorFlow", 0.0081, 0.032); + testLayer("spatial_padding", "TensorFlow", 0.0078, 0.028); + testLayer("mirror_pad", "TensorFlow", 0.0064, 0.013); + testLayer("pad_and_concat", "TensorFlow", 0.0021, 0.0098); + testLayer("padding", "ONNX", 0.0005, 0.0069); + testLayer("ReflectionPad2d", "ONNX", 0.00062, 0.0018); + 
testLayer("ZeroPad2d", "ONNX", 0.00037, 0.0018); +} + +TEST_P(Test_Int8_layers, AvePooling) +{ + testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075); + testLayer("ave_pool_same", "TensorFlow", 0.00153, 0.0041); + testLayer("average_pooling_1d", "ONNX", 0.002, 0.0048); + testLayer("average_pooling", "ONNX", 0.0014, 0.0032); + testLayer("average_pooling_dynamic_axes", "ONNX", 0.0014, 0.006); + + if (target != DNN_TARGET_CPU) + throw SkipTestException("Only CPU is supported"); + testLayer("ave_pool3d", "TensorFlow", 0.00175, 0.0047); + testLayer("ave_pool3d", "ONNX", 0.00063, 0.0016); +} + +TEST_P(Test_Int8_layers, MaxPooling) +{ + testLayer("pool_conv_1d", "ONNX", 0.0006, 0.0015); + if (target != DNN_TARGET_CPU) + throw SkipTestException("Only CPU is supported"); + testLayer("pool_conv_3d", "ONNX", 0.0033, 0.0124); + + /* All the below tests have MaxPooling as last layer, so computeMaxIdx is set to true + which is not supported by int8 maxpooling + testLayer("layer_pooling_max", "Caffe", 0.0021, 0.004); + testLayer("max_pool_even", "TensorFlow", 0.0048, 0.0139); + testLayer("max_pool_odd_valid", "TensorFlow", 0.0043, 0.012); + testLayer("conv_pool_nchw", "TensorFlow", 0.007, 0.025); + testLayer("max_pool3d", "TensorFlow", 0.0025, 0.0058); + testLayer("maxpooling_1d", "ONNX", 0.0018, 0.0037); + testLayer("two_maxpooling_1d", "ONNX", 0.0037, 0.0052); + testLayer("maxpooling", "ONNX", 0.0034, 0.0065); + testLayer("two_maxpooling", "ONNX", 0.0025, 0.0052); + testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);*/ +} + +TEST_P(Test_Int8_layers, Reduce) +{ + testLayer("reduce_mean", "TensorFlow", 0.0005, 0.0014); + testLayer("reduce_mean", "ONNX", 0.00062, 0.0014); + testLayer("reduce_mean_axis1", "ONNX", 0.00032, 0.0007); + testLayer("reduce_mean_axis2", "ONNX", 0.00033, 0.001); + + testLayer("reduce_sum", "TensorFlow", 0.015, 0.031); + testLayer("reduce_sum_channel", "TensorFlow", 0.008, 0.019); + testLayer("sum_pool_by_axis", "TensorFlow", 0.012, 0.032); + testLayer("reduce_sum", "ONNX", 0.0025, 0.0048); + + testLayer("reduce_max", "ONNX", 0, 0); + testLayer("reduce_max_axis_0", "ONNX", 0.0042, 0.007); + testLayer("reduce_max_axis_1", "ONNX", 0.0018, 0.0036); + + if (target != DNN_TARGET_CPU) + throw SkipTestException("Only CPU is supported"); + testLayer("reduce_mean3d", "ONNX", 0.00048, 0.0016); +} + +TEST_P(Test_Int8_layers, ReLU) +{ + testLayer("layer_relu", "Caffe", 0.0005, 0.002); + testLayer("ReLU", "ONNX", 0.0012, 0.0047); +} + +TEST_P(Test_Int8_layers, LeakyReLU) +{ + testLayer("leaky_relu", "TensorFlow", 0.0002, 0.0004); +} + +TEST_P(Test_Int8_layers, ReLU6) +{ + testLayer("keras_relu6", "TensorFlow", 0.0018, 0.0062); + testLayer("keras_relu6", "TensorFlow", 0.0018, 0.0062, 1, 1, false, true, true); + testLayer("clip_by_value", "TensorFlow", 0.0009, 0.002); + testLayer("clip", "ONNX", 0.00006, 0.00037); +} + +TEST_P(Test_Int8_layers, Sigmoid) +{ + testLayer("maxpooling_sigmoid", "ONNX", 0.0011, 0.0032); + testLayer("maxpooling_sigmoid_dynamic_axes", "ONNX", 0.0011, 0.0032); + testLayer("maxpooling_sigmoid_1d", "ONNX", 0.0011, 0.0037); +} + +TEST_P(Test_Int8_layers, Mish) +{ + testLayer("mish", "ONNX", 0.0015, 0.0025); +} + +TEST_P(Test_Int8_layers, Softmax) +{ + testLayer("layer_softmax", "Caffe", 0.0011, 0.0036); + testLayer("keras_softmax", "TensorFlow", 0.00093, 0.0027); + testLayer("slim_softmax", "TensorFlow", 0.0016, 0.0034); + testLayer("slim_softmax_v2", "TensorFlow", 0.0029, 0.017); + testLayer("softmax", "ONNX", 0.0016, 0.0028); + testLayer("log_softmax", "ONNX", 0.014, 
0.025); + testLayer("softmax_unfused", "ONNX", 0.0009, 0.0021); +} + +TEST_P(Test_Int8_layers, Concat) +{ + testLayer("layer_concat_shared_input", "Caffe", 0.0076, 0.029, 1, 1, true, false); + testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017); + testLayer("keras_pad_concat", "TensorFlow", 0.0032, 0.0089); + testLayer("concat_3d", "TensorFlow", 0.005, 0.014); + testLayer("concatenation", "ONNX", 0.0032, 0.009); +} + +TEST_P(Test_Int8_layers, BatchNorm) +{ + testLayer("layer_batch_norm", "Caffe", 0.0061, 0.019, 1, 1, true); + testLayer("fused_batch_norm", "TensorFlow", 0.0063, 0.02); + testLayer("batch_norm_text", "TensorFlow", 0.0048, 0.013, 1, 1, false, true, true); + testLayer("unfused_batch_norm", "TensorFlow", 0.0076, 0.019); + testLayer("fused_batch_norm_no_gamma", "TensorFlow", 0.0067, 0.015); + testLayer("unfused_batch_norm_no_gamma", "TensorFlow", 0.0123, 0.044); + testLayer("switch_identity", "TensorFlow", 0.0035, 0.011); + testLayer("batch_norm3d", "TensorFlow", 0.0077, 0.02); + testLayer("batch_norm", "ONNX", 0.0012, 0.0049); + testLayer("batch_norm_3d", "ONNX", 0.0039, 0.012); + testLayer("frozenBatchNorm2d", "ONNX", 0.001, 0.0018); + testLayer("batch_norm_subgraph", "ONNX", 0.0049, 0.0098); +} + +TEST_P(Test_Int8_layers, Scale) +{ + testLayer("batch_norm", "TensorFlow", 0.0028, 0.0098); + testLayer("scale", "ONNX", 0.0025, 0.0071); + testLayer("expand_hw", "ONNX", 0.0012, 0.0012); + testLayer("flatten_const", "ONNX", 0.0024, 0.0048); +} + +TEST_P(Test_Int8_layers, InnerProduct) +{ + testLayer("layer_inner_product", "Caffe", 0.005, 0.02, 1, 1, true); + testLayer("matmul", "TensorFlow", 0.0061, 0.019); + testLayer("nhwc_transpose_reshape_matmul", "TensorFlow", 0.0009, 0.0091); + testLayer("nhwc_reshape_matmul", "TensorFlow", 0.03, 0.071); + testLayer("matmul_layout", "TensorFlow", 0.035, 0.06); + testLayer("tf2_dense", "TensorFlow", 0, 0); + testLayer("matmul_add", "ONNX", 0.041, 0.082); + testLayer("linear", "ONNX", 0.0018, 0.0029); + testLayer("constant", "ONNX", 0.00021, 0.0006); + testLayer("lin_with_constant", "ONNX", 0.0011, 0.0016); +} + +TEST_P(Test_Int8_layers, Reshape) +{ + testLayer("reshape_layer", "TensorFlow", 0.0032, 0.0082); + testLayer("reshape_nchw", "TensorFlow", 0.0089, 0.029); + testLayer("reshape_conv", "TensorFlow", 0.035, 0.054); + testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078); + testLayer("reshape_as_shape", "TensorFlow", 0.0014, 0.0028); + testLayer("reshape_no_reorder", "TensorFlow", 0.0014, 0.0028); + testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, 0.014); + testLayer("dynamic_reshape", "ONNX", 0.0047, 0.0079); + testLayer("dynamic_reshape_opset_11", "ONNX", 0.0048, 0.0081); + testLayer("flatten_by_prod", "ONNX", 0.0048, 0.0081); + testLayer("squeeze", "ONNX", 0.0048, 0.0081); + testLayer("unsqueeze", "ONNX", 0.0033, 0.0053); + testLayer("squeeze_and_conv_dynamic_axes", "ONNX", 0.0054, 0.0154); + testLayer("unsqueeze_and_conv_dynamic_axes", "ONNX", 0.0037, 0.0151); +} + +TEST_P(Test_Int8_layers, Permute) +{ + testLayer("tf2_permute_nhwc_ncwh", "TensorFlow", 0.0028, 0.006); + testLayer("transpose", "ONNX", 0.0015, 0.0046); +} + +TEST_P(Test_Int8_layers, Identity) +{ + testLayer("expand_batch", "ONNX", 0.0027, 0.0036); + testLayer("expand_channels", "ONNX", 0.0013, 0.0019); + testLayer("expand_neg_batch", "ONNX", 0.00071, 0.0019); +} + +TEST_P(Test_Int8_layers, Slice) +{ + testLayer("split", "TensorFlow", 0.0033, 0.0056); + testLayer("slice_4d", "TensorFlow", 0.003, 0.0073); + testLayer("strided_slice", "TensorFlow", 0.008, 
0.0142); + testLayer("slice", "ONNX", 0.0046, 0.0077); + testLayer("slice_dynamic_axes", "ONNX", 0.0039, 0.0084); + testLayer("slice_opset_11_steps_2d", "ONNX", 0.0052, 0.0124); + testLayer("slice_opset_11_steps_3d", "ONNX", 0.0068, 0.014); + testLayer("slice_opset_11_steps_4d", "ONNX", 0.0041, 0.008); + testLayer("slice_opset_11_steps_5d", "ONNX", 0.0085, 0.021); +} + +TEST_P(Test_Int8_layers, Dropout) +{ + testLayer("layer_dropout", "Caffe", 0.0021, 0.004); + testLayer("dropout", "ONNX", 0.0029, 0.004); +} + +TEST_P(Test_Int8_layers, Eltwise) +{ + testLayer("layer_eltwise", "Caffe", 0.062, 0.15); + testLayer("conv_2_inps", "Caffe", 0.0086, 0.0232, 2, 1, true, false); + testLayer("eltwise_sub", "TensorFlow", 0.015, 0.047); + testLayer("eltwise_add_vec", "TensorFlow", 0.037, 0.21); // tflite 0.0095, 0.0365 + testLayer("eltwise_mul_vec", "TensorFlow", 0.173, 1.14); // tflite 0.0028, 0.017 + testLayer("channel_broadcast", "TensorFlow", 0.0025, 0.0063); + testLayer("split_equals", "TensorFlow", 0.02, 0.065); + testLayer("mul", "ONNX", 0.0039, 0.014); + testLayer("split_max", "ONNX", 0.004, 0.012); +} + +INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_layers, dnnBackendsAndTargets()); + +class Test_Int8_nets : public DNNTestLayer +{ +public: + void testClassificationNet(Net baseNet, const Mat& blob, const Mat& ref, double l1, double lInf) + { + Net qnet = baseNet.quantize(blob, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + + qnet.setInput(blob); + Mat out = qnet.forward(); + normAssert(ref, out, "", l1, lInf); + } + + void testDetectionNet(Net baseNet, const Mat& blob, const Mat& ref, + double confThreshold, double scoreDiff, double iouDiff) + { + Net qnet = baseNet.quantize(blob, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + + qnet.setInput(blob); + Mat out = qnet.forward(); + normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff); + } + + void testFaster(Net baseNet, const Mat& ref, double confThreshold, double scoreDiff, double iouDiff) + { + Mat inp = imread(_tf("dog416.png")); + resize(inp, inp, Size(800, 600)); + Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false); + Mat imInfo = (Mat_(1, 3) << inp.rows, inp.cols, 1.6f); + + Net qnet = baseNet.quantize(std::vector{blob, imInfo}, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + + qnet.setInput(blob, "data"); + qnet.setInput(imInfo, "im_info"); + Mat out = qnet.forward(); + normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff); + } + + void testONNXNet(const String& basename, double l1, double lInf, bool useSoftmax = false) + { + String onnxmodel = findDataFile("dnn/onnx/models/" + basename + ".onnx", false); + + Mat blob = readTensorFromONNX(findDataFile("dnn/onnx/data/input_" + basename + ".pb")); + Mat ref = readTensorFromONNX(findDataFile("dnn/onnx/data/output_" + basename + ".pb")); + Net baseNet = readNetFromONNX(onnxmodel); + baseNet.setPreferableBackend(backend); + baseNet.setPreferableTarget(target); + + Net qnet = baseNet.quantize(blob, CV_32F, CV_32F); + qnet.setInput(blob); + Mat out = qnet.forward(); + + if (useSoftmax) + { + LayerParams lp; + Net netSoftmax; + netSoftmax.addLayerToPrev("softmaxLayer", "Softmax", lp); + netSoftmax.setPreferableBackend(DNN_BACKEND_OPENCV); + + netSoftmax.setInput(out); + out = netSoftmax.forward(); + + netSoftmax.setInput(ref); + ref = netSoftmax.forward(); + } + + normAssert(ref, out, "", l1, 
lInf); + } + + void testDarknetModel(const std::string& cfg, const std::string& weights, + const cv::Mat& ref, double scoreDiff, double iouDiff, + float confThreshold = 0.24, float nmsThreshold = 0.4) + { + CV_Assert(ref.cols == 7); + std::vector > refClassIds; + std::vector > refScores; + std::vector > refBoxes; + for (int i = 0; i < ref.rows; ++i) + { + int batchId = static_cast(ref.at(i, 0)); + int classId = static_cast(ref.at(i, 1)); + float score = ref.at(i, 2); + float left = ref.at(i, 3); + float top = ref.at(i, 4); + float right = ref.at(i, 5); + float bottom = ref.at(i, 6); + Rect2d box(left, top, right - left, bottom - top); + if (batchId >= refClassIds.size()) + { + refClassIds.resize(batchId + 1); + refScores.resize(batchId + 1); + refBoxes.resize(batchId + 1); + } + refClassIds[batchId].push_back(classId); + refScores[batchId].push_back(score); + refBoxes[batchId].push_back(box); + } + + Mat img1 = imread(_tf("dog416.png")); + Mat img2 = imread(_tf("street.png")); + std::vector samples(2); + samples[0] = img1; samples[1] = img2; + + // determine test type, whether batch or single img + int batch_size = refClassIds.size(); + CV_Assert(batch_size == 1 || batch_size == 2); + samples.resize(batch_size); + + Mat inp = blobFromImages(samples, 1.0/255, Size(416, 416), Scalar(), true, false); + + Net baseNet = readNetFromDarknet(findDataFile("dnn/" + cfg), findDataFile("dnn/" + weights, false)); + Net qnet = baseNet.quantize(inp, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + qnet.setInput(inp); + std::vector outs; + qnet.forward(outs, qnet.getUnconnectedOutLayersNames()); + + for (int b = 0; b < batch_size; ++b) + { + std::vector classIds; + std::vector confidences; + std::vector boxes; + for (int i = 0; i < outs.size(); ++i) + { + Mat out; + if (batch_size > 1){ + // get the sample slice from 3D matrix (batch, box, classes+5) + Range ranges[3] = {Range(b, b+1), Range::all(), Range::all()}; + out = outs[i](ranges).reshape(1, outs[i].size[1]); + }else{ + out = outs[i]; + } + for (int j = 0; j < out.rows; ++j) + { + Mat scores = out.row(j).colRange(5, out.cols); + double confidence; + Point maxLoc; + minMaxLoc(scores, 0, &confidence, 0, &maxLoc); + + if (confidence > confThreshold) { + float* detection = out.ptr(j); + double centerX = detection[0]; + double centerY = detection[1]; + double width = detection[2]; + double height = detection[3]; + boxes.push_back(Rect2d(centerX - 0.5 * width, centerY - 0.5 * height, + width, height)); + confidences.push_back(confidence); + classIds.push_back(maxLoc.x); + } + } + } + + // here we need NMS of boxes + std::vector indices; + NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices); + + std::vector nms_classIds; + std::vector nms_confidences; + std::vector nms_boxes; + + for (size_t i = 0; i < indices.size(); ++i) + { + int idx = indices[i]; + Rect2d box = boxes[idx]; + float conf = confidences[idx]; + int class_id = classIds[idx]; + nms_boxes.push_back(box); + nms_confidences.push_back(conf); + nms_classIds.push_back(class_id); + } + + if (cvIsNaN(iouDiff)) + { + if (b == 0) + std::cout << "Skip accuracy checks" << std::endl; + continue; + } + + normAssertDetections(refClassIds[b], refScores[b], refBoxes[b], nms_classIds, nms_confidences, nms_boxes, + format("batch size %d, sample %d\n", batch_size, b).c_str(), confThreshold, scoreDiff, iouDiff); + } + } +}; + +TEST_P(Test_Int8_nets, AlexNet) +{ +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + 
applyTestTag(CV_TEST_TAG_MEMORY_2GB); +#else + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#endif + if (backend != DNN_BACKEND_OPENCV) + throw SkipTestException("Only OpenCV backend is supported"); + + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_alexnet.prototxt"), + findDataFile("dnn/bvlc_alexnet.caffemodel", false)); + + Mat inp = imread(_tf("grace_hopper_227.png")); + Mat blob = blobFromImage(inp, 1.0, Size(227, 227), Scalar(), false); + Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy")); + + float l1 = 1e-4, lInf = 0.003; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, GoogLeNet) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + std::vector inpMats; + inpMats.push_back( imread(_tf("googlenet_0.png")) ); + inpMats.push_back( imread(_tf("googlenet_1.png")) ); + Mat blob = blobFromImages(inpMats, 1.0, Size(224, 224), Scalar(), false); + Mat ref = blobFromNPY(_tf("googlenet_prob.npy")); + + float l1 = 2e-4, lInf = 0.06; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, ResNet50) +{ + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); + if (backend != DNN_BACKEND_OPENCV) + throw SkipTestException("Only OpenCV backend is supported"); + + Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"), + findDataFile("dnn/ResNet-50-model.caffemodel", false)); + + Mat inp = imread(_tf("googlenet_0.png")); + Mat blob = blobFromImage(inp, 1.0, Size(224, 224), Scalar(), false); + Mat ref = blobFromNPY(_tf("resnet50_prob.npy")); + + float l1 = 3e-4, lInf = 0.035; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, DenseNet121) +{ + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + + Net net = readNetFromCaffe(findDataFile("dnn/DenseNet_121.prototxt", false), + findDataFile("dnn/DenseNet_121.caffemodel", false)); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0 / 255.0, Size(224, 224), Scalar(), true, true); + Mat ref = blobFromNPY(_tf("densenet_121_output.npy")); + + float l1 = 0.76, lInf = 3.31; // seems wrong + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, SqueezeNet_v1_1) +{ + if(target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"), + findDataFile("dnn/squeezenet_v1.1.caffemodel", false)); + + Mat inp = imread(_tf("googlenet_0.png")); + Mat blob = blobFromImage(inp, 1.0, Size(227, 227), Scalar(), false, true); + Mat ref = blobFromNPY(_tf("squeezenet_v1.1_prob.npy")); + + float l1 = 3e-4, lInf = 0.056; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, CaffeNet) +{ +#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32)) + applyTestTag(CV_TEST_TAG_MEMORY_2GB); +#else + applyTestTag(target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#endif + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD + && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + float l1 = 4e-5, lInf = 0.0025; + testONNXNet("caffenet", l1, lInf); +} + +TEST_P(Test_Int8_nets, RCNN_ILSVRC13) +{ +#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32)) + applyTestTag(CV_TEST_TAG_MEMORY_2GB); +#else + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#endif + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD + && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + float l1 = 0.02, lInf = 0.042; + testONNXNet("rcnn_ilsvrc13", l1, lInf); +} + +TEST_P(Test_Int8_nets, Inception_v2) +{ + testONNXNet("inception_v2", default_l1, default_lInf, true); +} + +TEST_P(Test_Int8_nets, MobileNet_v2) +{ + testONNXNet("mobilenetv2", default_l1, default_lInf, true); +} + +TEST_P(Test_Int8_nets, Shufflenet) +{ + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + { + if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + } + testONNXNet("shufflenet", default_l1, default_lInf); +} + +TEST_P(Test_Int8_nets, MobileNet_SSD) +{ + Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false), + findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false)); + + Mat inp = imread(_tf("street.png")); + Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); + Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy")); + + float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, MobileNet_v1_SSD) +{ + Net net = readNetFromTensorflow(findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", false), + findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy")); + + float confThreshold = 0.5, scoreDiff = 0.034, iouDiff = 0.13; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, MobileNet_v1_SSD_PPN) +{ +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) + applyTestTag(target == DNN_TARGET_OPENCL ? 
CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, + CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + Net net = readNetFromTensorflow(findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pb", false), + findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/ssd_mobilenet_v1_ppn_coco.detection_out.npy")); + + float confThreshold = 0.51, scoreDiff = 0.04, iouDiff = 0.06; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, Inception_v2_SSD) +{ + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + Net net = readNetFromTensorflow(findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pb", false), + findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pbtxt")); + + Mat inp = imread(_tf("street.png")); + Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false); + Mat ref = (Mat_(5, 7) << 0, 1, 0.90176028, 0.19872092, 0.36311883, 0.26461923, 0.63498729, + 0, 3, 0.93569964, 0.64865261, 0.45906419, 0.80675775, 0.65708131, + 0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015, + 0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527, + 0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384); + + float confThreshold = 0.5, scoreDiff = 0.0114, iouDiff = 0.22; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, opencv_face_detector) +{ + Net net = readNetFromCaffe(findDataFile("dnn/opencv_face_detector.prototxt"), + findDataFile("dnn/opencv_face_detector.caffemodel", false)); + + Mat inp = imread(findDataFile("gpu/lbpcascade/er.png")); + Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false); + Mat ref = (Mat_(6, 7) << 0, 1, 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631, + 0, 1, 0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168, + 0, 1, 0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290, + 0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477, + 0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494, + 0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801); + + float confThreshold = 0.5, scoreDiff = 0.002, iouDiff = 0.21; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, EfficientDet) +{ + if (target != DNN_TARGET_CPU) + { + if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); + } + Net net = readNetFromTensorflow(findDataFile("dnn/efficientdet-d0.pb", false), + findDataFile("dnn/efficientdet-d0.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0/255, Size(512, 512), Scalar(123.675, 116.28, 103.53)); + Mat ref = (Mat_(3, 7) << 0, 1, 0.8437444, 0.153996080160141, 0.20534580945968628, 0.7463544607162476, 
0.7414066195487976, + 0, 17, 0.8245924, 0.16657517850399017, 0.3996818959712982, 0.4111558794975281, 0.9306337833404541, + 0, 7, 0.8039304, 0.6118435263633728, 0.13175517320632935, 0.9065558314323425, 0.2943994700908661); + + float confThreshold = 0.65, scoreDiff = 0.17, iouDiff = 0.18; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_resnet50) +{ + applyTestTag( + (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB), + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + +#ifdef INF_ENGINE_RELEASE + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && + (INF_ENGINE_VER_MAJOR_LT(2019020000) || target != DNN_TARGET_CPU)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + + if (INF_ENGINE_VER_MAJOR_GT(2019030000) && + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + + Net net = readNetFromTensorflow(findDataFile("dnn/faster_rcnn_resnet50_coco_2018_01_28.pb", false), + findDataFile("dnn/faster_rcnn_resnet50_coco_2018_01_28.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(800, 600), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/faster_rcnn_resnet50_coco_2018_01_28.detection_out.npy")); + + float confThreshold = 0.5, scoreDiff = 0.025, iouDiff = 0.15; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_inceptionv2) +{ + applyTestTag( + (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB), + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + +#ifdef INF_ENGINE_RELEASE + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && + (INF_ENGINE_VER_MAJOR_LT(2019020000) || target != DNN_TARGET_CPU)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + + if (INF_ENGINE_VER_MAJOR_GT(2019030000) && + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + + Net net = readNetFromTensorflow(findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false), + findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(800, 600), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/faster_rcnn_inception_v2_coco_2018_01_28.detection_out.npy")); + + float confThreshold = 0.5, scoreDiff = 0.21, iouDiff = 0.1; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_vgg16) +{ + applyTestTag( +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + CV_TEST_TAG_MEMORY_2GB, +#else + (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB), +#endif + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + +#if defined(INF_ENGINE_RELEASE) + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) + applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); +#endif + + Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_vgg16.prototxt"), + findDataFile("dnn/VGG16_faster_rcnn_final.caffemodel", false)); + + Mat ref = (Mat_(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849, + 0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953, + 0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166); + + float confThreshold = 0.8, scoreDiff = 0.024, iouDiff = 0.35; + testFaster(net, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_zf) +{ + applyTestTag( +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + CV_TEST_TAG_MEMORY_2GB, +#else + (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB), +#endif + CV_TEST_TAG_DEBUG_LONG + ); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); + + if (target == DNN_TARGET_CUDA_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + + Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_zf.prototxt"), + findDataFile("dnn/ZF_faster_rcnn_final.caffemodel", false)); + + Mat ref = (Mat_(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395, + 0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762, + 0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176); + + float confThreshold = 0.8, scoreDiff = 0.021, iouDiff = 0.1; + testFaster(net, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, RFCN) +{ + applyTestTag( + (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_2GB), + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); + + Net net = readNetFromCaffe(findDataFile("dnn/rfcn_pascal_voc_resnet50.prototxt"), + findDataFile("dnn/resnet50_rfcn_final.caffemodel", false)); + + Mat ref = (Mat_(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234, + 0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16); + + float confThreshold = 0.8, scoreDiff = 0.017, iouDiff = 0.11; + testFaster(net, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, YoloVoc) +{ + applyTestTag( +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + CV_TEST_TAG_MEMORY_2GB, +#else + CV_TEST_TAG_MEMORY_1GB, +#endif + CV_TEST_TAG_LONG + ); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); +#endif +#if defined(INF_ENGINE_RELEASE) + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && + target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); +#endif + + Mat ref = (Mat_(6, 7) << 0, 6, 0.750469f, 0.577374f, 0.127391f, 0.902949f, 0.300809f, + 0, 1, 0.780879f, 0.270762f, 0.264102f, 0.732475f, 0.745412f, + 0, 11, 0.901615f, 0.1386f, 0.338509f, 0.421337f, 0.938789f, + 1, 14, 0.623813f, 0.183179f, 0.381921f, 0.247726f, 0.625847f, + 1, 6, 0.667770f, 0.446555f, 0.453578f, 0.499986f, 
0.519167f, + 1, 6, 0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f); + + std::string config_file = "yolo-voc.cfg"; + std::string weights_file = "yolo-voc.weights"; + + double scoreDiff = 0.1, iouDiff = 0.3; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff); + } + + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff); + } +} + +TEST_P(Test_Int8_nets, TinyYoloVoc) +{ + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +#if defined(INF_ENGINE_RELEASE) + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && + target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); +#endif + + Mat ref = (Mat_(4, 7) << 0, 6, 0.761967f, 0.579042f, 0.159161f, 0.894482f, 0.31994f, + 0, 11, 0.780595f, 0.129696f, 0.386467f, 0.445275f, 0.920994f, + 1, 6, 0.651450f, 0.460526f, 0.458019f, 0.522527f, 0.5341f, + 1, 6, 0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); + + std::string config_file = "tiny-yolo-voc.cfg"; + std::string weights_file = "tiny-yolo-voc.weights"; + + double scoreDiff = 0.043, iouDiff = 0.12; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff); + } + + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff); + } +} + +TEST_P(Test_Int8_nets, YOLOv3) +{ + applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB)); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + const int N0 = 3; + const int N1 = 6; + static const float ref_[/* (N0 + N1) * 7 */] = { +0, 16, 0.998836f, 0.160024f, 0.389964f, 0.417885f, 0.943716f, +0, 1, 0.987908f, 0.150913f, 0.221933f, 0.742255f, 0.746261f, +0, 7, 0.952983f, 0.614621f, 0.150257f, 0.901368f, 0.289251f, + +1, 2, 0.997412f, 0.647584f, 0.459939f, 0.821037f, 0.663947f, +1, 2, 0.989633f, 0.450719f, 0.463353f, 0.496306f, 0.522258f, +1, 0, 0.980053f, 0.195856f, 0.378454f, 0.258626f, 0.629257f, +1, 9, 0.785341f, 0.665503f, 0.373543f, 0.688893f, 0.439244f, +1, 9, 0.733275f, 0.376029f, 0.315694f, 0.401776f, 0.395165f, +1, 9, 0.384815f, 0.659824f, 0.372389f, 0.673927f, 0.429412f, + }; + Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); + + std::string config_file = "yolov3.cfg"; + std::string weights_file = "yolov3.weights"; + + double scoreDiff = 0.08, iouDiff = 0.21, confThreshold = 0.25; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold); + } + +#if defined(INF_ENGINE_RELEASE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + { + if (target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_OPENCL_FP16 && INF_ENGINE_VER_MAJOR_LE(202010000)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); + } +#endif + + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold); + } +} + +TEST_P(Test_Int8_nets, YOLOv4) +{ + applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB)); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + const int N0 = 3; + const int N1 = 7; + static const float ref_[/* (N0 + N1) * 7 */] = { +0, 16, 0.992194f, 0.172375f, 0.402458f, 0.403918f, 0.932801f, +0, 1, 0.988326f, 0.166708f, 0.228236f, 0.737208f, 0.735803f, +0, 7, 0.94639f, 0.602523f, 0.130399f, 0.901623f, 0.298452f, + +1, 2, 0.99761f, 0.646556f, 0.45985f, 0.816041f, 0.659067f, +1, 0, 0.988913f, 0.201726f, 0.360282f, 0.266181f, 0.631728f, +1, 2, 0.98233f, 0.452007f, 0.462217f, 0.495612f, 0.521687f, +1, 9, 0.919195f, 0.374642f, 0.316524f, 0.398126f, 0.393714f, +1, 9, 0.856303f, 0.666842f, 0.372215f, 0.685539f, 0.44141f, +1, 9, 0.313516f, 0.656791f, 0.374734f, 0.671959f, 0.438371f, +1, 9, 0.256625f, 0.940232f, 0.326931f, 0.967586f, 0.374002f, + }; + Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); + + std::string config_file = "yolov4.cfg"; + std::string weights_file = "yolov4.weights"; + double scoreDiff = 0.1, iouDiff = 0.17; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff); + } + + { + SCOPED_TRACE("batch size 2"); + +#if defined(INF_ENGINE_RELEASE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + { + if (target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_OPENCL_FP16 && INF_ENGINE_VER_MAJOR_LE(202010000)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); + } +#endif + + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff); + } +} + +TEST_P(Test_Int8_nets, YOLOv4_tiny) +{ + applyTestTag( + target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB + ); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) + if (target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + const float confThreshold = 0.6; + + const int N0 = 2; + const int N1 = 3; + static const float ref_[/* (N0 + N1) * 7 */] = { +0, 7, 0.85935f, 0.593484f, 0.141211f, 0.920356f, 0.291593f, +0, 16, 0.795188f, 0.169207f, 0.386886f, 0.423753f, 0.933004f, + +1, 2, 0.996832f, 0.653802f, 0.464573f, 0.815193f, 0.653292f, +1, 2, 0.963325f, 0.451151f, 0.458915f, 0.496255f, 0.52241f, +1, 0, 0.926244f, 0.194851f, 0.361743f, 0.260277f, 0.632364f, + }; + Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); + + std::string config_file = "yolov4-tiny.cfg"; + std::string weights_file = "yolov4-tiny.weights"; + double scoreDiff = 0.12; + double iouDiff = target == DNN_TARGET_OPENCL_FP16 ? 
0.2 : 0.082; + +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD) // bad accuracy + iouDiff = std::numeric_limits::quiet_NaN(); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL) + iouDiff = std::numeric_limits::quiet_NaN(); + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + iouDiff = std::numeric_limits::quiet_NaN(); +#endif + + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold); + } + + /* bad accuracy on second image + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold); + } + */ + +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD) // bad accuracy + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +} + +INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_nets, dnnBackendsAndTargets()); +}} // namespace
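For reference, the end-to-end int8 workflow that the tests above exercise can be summarized in a short standalone sketch. This is not part of the patch; the model path, image, and preprocessing values are placeholders, and the quantize() / getInputDetails() / getOutputDetails() calls are used exactly as in Test_Int8_layers::testLayer above, so treat it as a usage sketch under those assumptions rather than reference documentation.

// Minimal usage sketch (not patch code). Placeholders: "model.onnx", "input.jpg",
// the 224x224 size and 1/255 scaling. API calls mirror Test_Int8_layers::testLayer.
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    using namespace cv;
    using namespace cv::dnn;

    // Load a float32 model and prepare a representative calibration blob.
    Net net  = readNetFromONNX("model.onnx");                            // placeholder path
    Mat img  = imread("input.jpg");                                      // placeholder image
    Mat blob = blobFromImage(img, 1.0/255, Size(224, 224), Scalar(), true, false);

    // Quantize with int8 inputs and outputs; passing CV_32F/CV_32F instead keeps
    // float I/O and only runs the internal layers in int8, as testClassificationNet does.
    Net qnet = net.quantize(blob, CV_8S, CV_8S);

    // Per-input / per-output quantization parameters chosen during calibration.
    std::vector<float> inScale, outScale;
    std::vector<int>   inZp, outZp;
    qnet.getInputDetails(inScale, inZp);
    qnet.getOutputDetails(outScale, outZp);

    // Quantize the input:  q = f / scale + zeropoint
    Mat blob8;
    blob.convertTo(blob8, CV_8S, 1.0 / inScale[0], inZp[0]);

    qnet.setInput(blob8);
    Mat out8 = qnet.forward();

    // Dequantize the output:  f = scale * (q - zeropoint)
    Mat out;
    out8.convertTo(out, CV_32F, outScale[0], -(outScale[0] * outZp[0]));

    return 0;
}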
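The activation-layer tryQuantize() overrides earlier in this patch (TanH, Swish, Mish, Sigmoid, ELU, AbsVal, BNLL) all follow one pattern: every possible int8 input code is dequantized, pushed through the float activation, and requantized once into a 256-entry table, so the int8 forward pass reduces to a single lookup. The sketch below restates that pattern as standalone code; the helper name and the free-function activation pointer are illustrative, not part of the patch.

// Illustrative LUT construction, assuming the same scale/zeropoint convention as
// the functors above: x = inScale*(q - inZp), q_out = outZp + round(f(x)/outScale).
#include <opencv2/core.hpp>
#include <cmath>
#include <cstdint>

static cv::Mat buildActivationLUT(float inpScale, int inpZp,
                                  float outScale, int outZp,
                                  float (*act)(float))
{
    cv::Mat lut(1, 256, CV_8S);
    int8_t* table = lut.ptr<int8_t>();
    for (int i = -128; i < 128; i++)
    {
        float x = inpScale * (i - inpZp);               // dequantize the int8 code
        float y = act(x);                               // float activation
        int q = outZp + (int)std::round(y / outScale);  // requantize
        table[i + 128] = cv::saturate_cast<int8_t>(q);
    }
    return lut;
}

// Example: an int8 sigmoid table; applying it is simply dst[j] = table[src[j] + 128].
static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }
// cv::Mat sigmoidLUT = buildActivationLUT(inScale, inZp, outScale, outZp, sigmoidf);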