Merge pull request #24980 from fengyuentau:on-fly-quantization-removal

dnn cleanup: On-fly-quantization removal #24980

On-the-fly quantization was first introduced via https://github.com/opencv/opencv/pull/20228.
We decided to remove it but keep the int8 layer implementations, because on-the-fly quantization
is less practical now that many dedicated tools exist for model quantization.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
Yuantao Feng committed via GitHub 12 months ago (commit d4fd5157fa, parent 3125f9708d)
Changed files (27), with the number of changed lines in each:

1. modules/dnn/include/opencv2/dnn/dnn.hpp (30)
2. modules/dnn/src/layer.cpp (6)
3. modules/dnn/src/layers/batch_norm_layer.cpp (13)
4. modules/dnn/src/layers/blank_layer.cpp (6)
5. modules/dnn/src/layers/concat_layer.cpp (8)
6. modules/dnn/src/layers/const_layer.cpp (10)
7. modules/dnn/src/layers/convolution_layer.cpp (53)
8. modules/dnn/src/layers/elementwise_layers.cpp (64)
9. modules/dnn/src/layers/eltwise_layer.cpp (32)
10. modules/dnn/src/layers/flatten_layer.cpp (6)
11. modules/dnn/src/layers/fully_connected_layer.cpp (51)
12. modules/dnn/src/layers/nary_eltwise_layers.cpp (6)
13. modules/dnn/src/layers/padding_layer.cpp (10)
14. modules/dnn/src/layers/permute_layer.cpp (6)
15. modules/dnn/src/layers/pooling_layer.cpp (17)
16. modules/dnn/src/layers/reorg_layer.cpp (6)
17. modules/dnn/src/layers/reshape_layer.cpp (6)
18. modules/dnn/src/layers/resize_layer.cpp (6)
19. modules/dnn/src/layers/scale_layer.cpp (8)
20. modules/dnn/src/layers/shuffle_channel_layer.cpp (6)
21. modules/dnn/src/layers/slice_layer.cpp (12)
22. modules/dnn/src/layers/softmax_layer.cpp (18)
23. modules/dnn/src/layers/split_layer.cpp (12)
24. modules/dnn/src/net.cpp (27)
25. modules/dnn/src/net_impl.hpp (5)
26. modules/dnn/src/net_quantization.cpp (304)
27. modules/dnn/test/test_int8_layers.cpp (6)

@@ -259,15 +259,6 @@ CV__DNN_INLINE_NS_BEGIN
*/
virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);
/** @brief Tries to quantize the given layer and compute the quantization parameters required for fixed point implementation.
* @param[in] scales input and output scales.
* @param[in] zeropoints input and output zeropoints.
* @param[out] params Quantized parameters required for fixed point implementation of that layer.
* @returns True if layer can be quantized.
*/
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params);
/** @brief Given the @p input blobs, computes the output @p blobs.
* @param[in] inputs the input blobs.
* @param[out] outputs allocated output blobs, which will store results of the computation.
@@ -610,27 +601,6 @@ CV__DNN_INLINE_NS_BEGIN
CV_WRAP_AS(forwardAndRetrieve) void forward(CV_OUT std::vector<std::vector<Mat> >& outputBlobs,
const std::vector<String>& outBlobNames);
/** @brief Returns a quantized Net from a floating-point Net.
* @param calibData Calibration data to compute the quantization parameters.
* @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S.
* @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S.
* @param perChannel Quantization granularity of quantized Net. The default is true, that means quantize model
* in per-channel way (channel-wise). Set it false to quantize model in per-tensor way (or tensor-wise).
*/
CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel=true);
/** @brief Returns input scale and zeropoint for a quantized Net.
* @param scales output parameter for returning input scales.
* @param zeropoints output parameter for returning input zeropoints.
*/
CV_WRAP void getInputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
/** @brief Returns output scale and zeropoint for a quantized Net.
* @param scales output parameter for returning output scales.
* @param zeropoints output parameter for returning output zeropoints.
*/
CV_WRAP void getOutputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
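For context, here is a minimal sketch of how the removed quantization API was typically driven from application code. Only the `quantize`, `getInputDetails` and `getOutputDetails` signatures come from the declarations deleted above; the model path, calibration image and blob parameters are purely illustrative:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    // Load an FP32 model and prepare a calibration blob (illustrative values).
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");
    cv::Mat calib = cv::dnn::blobFromImage(cv::imread("calib.jpg"),
                                           1.0 / 255.0, cv::Size(224, 224));

    // One-call calibration + quantization; int8 inputs and outputs, per-channel weights.
    cv::dnn::Net qnet = net.quantize(calib, CV_8S, CV_8S, /*perChannel=*/true);

    // Scales/zeropoints computed during calibration, as documented above.
    std::vector<float> inScales, outScales;
    std::vector<int> inZps, outZps;
    qnet.getInputDetails(inScales, inZps);
    qnet.getOutputDetails(outScales, outZps);
    return 0;
}
```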
/**
* @brief Ask network to use specific computation backend where it supported.
* @param[in] backendId backend identifier.

@@ -228,12 +228,6 @@ void Layer::run(const std::vector<Mat>& inputs, std::vector<Mat>& outputs, std::
this->forward(inputs, outputs, internals);
}
bool Layer::tryQuantize(const std::vector<std::vector<float>>& scales,
const std::vector<std::vector<int>>& zeropoints, LayerParams& params)
{
return false;
}
Layer::~Layer() {}
bool Layer::getMemoryShapes(const std::vector<MatShape>& inputs,

@@ -423,19 +423,6 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
params.set("input_scale", scales[0][0]);
params.set("input_zeropoint", zeropoints[0][0]);
params.set("eps", epsilon);
params.blobs.clear();
params.blobs.push_back(origin_weights);
params.blobs.push_back(origin_bias);
return true;
}
#ifdef HAVE_WEBNN
virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{

@@ -168,12 +168,6 @@ public:
return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
};
Ptr<Layer> BlankLayer::create(const LayerParams& params)

@@ -490,14 +490,6 @@ public:
}
#endif // HAVE_TIMVX
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (padding)
params.set("padding_value", zeropoints[1][0]);
return true;
}
#ifdef HAVE_WEBNN
virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{

@@ -177,16 +177,6 @@ public:
return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blob);
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
Mat quantizedBlob;
blobs[0].convertTo(quantizedBlob, CV_8S, 1.f/scales[1][0], zeropoints[1][0]);
params.blobs.clear();
params.blobs.push_back(quantizedBlob);
return true;
}
};
Ptr<Layer> ConstLayer::create(const LayerParams& params)

@@ -1297,59 +1297,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
// References - https://arxiv.org/pdf/1712.05877.pdf
// Quantized convolution with variable weights is not supported.
if (blobs.empty())
return false;
float inputScale = scales[0][0], outputScale = scales[1][0];
int inputZp = zeropoints[0][0];
params.set("input_zeropoint", inputZp);
params.set("input_scale", inputScale);
Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
Mat biasQuantized(1, numOutput, CV_32S);
Mat outputMultiplier(1, numOutput, CV_32F);
bool perChannel = params.get<bool>("per_channel", true);
if (perChannel) // per-Channel quantization.
{
for (int i = 0; i < numOutput; i++)
{
double weightsScale = getWeightScale(weightsMat.row(i));
weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
float biasScale = inputScale * weightsScale;
biasQuantized.at<int>(i) = cvRound(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
outputMultiplier.at<float>(i) = biasScale / outputScale;
}
}
else // per-Tensor quantization.
{
double weightsScale = getWeightScale(weightsMat);
weightsMat.convertTo(weightsQuantized, CV_8S, 1.f/weightsScale);
float biasScale = inputScale * weightsScale;
for (int i = 0; i < numOutput; i++)
{
biasQuantized.at<int>(i) = cvRound(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
outputMultiplier.at<float>(i) = biasScale / outputScale;
}
}
params.blobs.clear();
params.set("per_channel", perChannel);
params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
params.blobs.push_back(biasQuantized);
params.blobs.push_back(outputMultiplier);
return true;
}
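The removed code above implements the affine quantization scheme of the referenced paper, where a real value x is represented as x ≈ scale · (q − zeropoint) with q an int8 code. A minimal, OpenCV-independent sketch of that mapping (names illustrative) may help when reading the per-channel/per-tensor branches:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine int8 quantization: x ~= scale * (q - zeropoint).
static int8_t quantizeValue(float x, float scale, int zeropoint)
{
    int q = zeropoint + static_cast<int>(std::round(x / scale));
    return static_cast<int8_t>(std::min(127, std::max(-128, q)));  // saturate to int8 range
}

static float dequantizeValue(int8_t q, float scale, int zeropoint)
{
    return scale * (static_cast<int>(q) - zeropoint);
}
```

In the convolution case the weights get their own scale (one per output channel, or one per tensor), and `outputMultiplier` stores `inputScale * weightsScale / outputScale`, i.e. the factor that rescales the int32 accumulator of the int8 convolution back into the output's int8 range.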
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@@ -250,12 +250,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return func.tryQuantize(scales, zeropoints, params);
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
@@ -289,8 +283,6 @@ struct BaseFunctor
bool tryFuse(Ptr<dnn::Layer>&) { return false; }
void getScaleShift(Mat&, Mat&) const {}
bool tryQuantize(const std::vector<std::vector<float>>&, const std::vector<std::vector<int>>&, LayerParams&) { return false; }
};
struct ReLUFunctor : public BaseFunctor
@@ -458,32 +450,6 @@ struct ReLUFunctor : public BaseFunctor
}
#endif
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
if (slope != 0.f)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = x >= 0.f ? x : slope*x;
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
}
params.set("input_scale", scales[0][0]);
params.set("input_zeropoint", zeropoints[0][0]);
params.set("slope", slope);
return true;
}
int64 getFLOPSPerElement() const { return 1; }
};
@@ -634,14 +600,6 @@ struct ReLU6Functor : public BaseFunctor
}
#endif
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
params.set("input_scale", scales[0][0]);
params.set("input_zeropoint", zeropoints[0][0]);
return true;
}
int64 getFLOPSPerElement() const { return 2; }
};
@@ -692,28 +650,6 @@ struct BaseDefaultFunctor : public BaseFunctor
inline void setKernelParams(ocl::Kernel& kernel) const {}
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale * static_cast<float>(i - inpZp);
float y = static_cast<T const*>(this)->calculate(x);
int quantized = outZp + static_cast<int>(std::round(y/outScale));
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
params.set("input_scale", scales[0][0]);
params.set("input_zeropoint", zeropoints[0][0]);
return true;
}
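The 256-entry table built above covers every possible int8 input, so at inference time the quantized activation can presumably be applied with a single table lookup per element, offsetting the signed input by 128 to get a valid index. A hedged sketch of that consumption, with names illustrative:

```cpp
#include <cstddef>
#include <cstdint>

// Apply a quantized activation through a 256-entry LUT: table[q + 128]
// already holds the requantized int8 output for input code q.
static void applyInt8LUT(const int8_t* src, int8_t* dst, size_t n, const int8_t table[256])
{
    for (size_t i = 0; i < n; ++i)
        dst[i] = table[static_cast<int>(src[i]) + 128];
}
```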
#ifdef HAVE_CUDA
Ptr<BackendNode> initCUDA(int target, csl::Stream stream)
{

@@ -864,38 +864,6 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
if (op == SUM)
{
std::vector<float> newCoeffs;
float offset = zeropoints[1][0];
float out_sc = scales[1][0];
for (int i = 0; i < scales[0].size(); i++)
{
float coeff = coeffs.empty() ? 1.f : coeffs[i];
float newcoeff = (scales[0][i] * coeff) / out_sc;
newCoeffs.push_back(newcoeff);
offset -= (newcoeff * zeropoints[0][i]);
}
params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
params.set("offset", offset);
return true;
}
else if (op == PROD)
{
std::vector<float> newCoeffs = scales[0];
newCoeffs[0] /= scales[1][0];
params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
params.set("offset", zeropoints[1][0]);
return true;
}
return op == MAX;
}
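For the SUM branch above, `coeff` and `offset` follow from substituting the affine quantization of each input into the floating-point weighted sum. A short derivation, with s_i/z_i the entries of scales[0]/zeropoints[0] and s_out/z_out those of scales[1]/zeropoints[1]:

```latex
y = \sum_i c_i\, s_i (q_i - z_i), \qquad
q_{\mathrm{out}} = z_{\mathrm{out}} + \frac{y}{s_{\mathrm{out}}}
  = \Big(z_{\mathrm{out}} - \sum_i \tfrac{c_i s_i}{s_{\mathrm{out}}}\, z_i\Big)
    + \sum_i \tfrac{c_i s_i}{s_{\mathrm{out}}}\, q_i
```

The parenthesized term is `offset` and each factor c_i·s_i/s_out is the corresponding `newcoeff`, which is exactly what the loop above accumulates starting from `offset = zeropoints[1][0]`.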
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@@ -244,12 +244,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
int _startAxis;
int _endAxis;
};

@@ -796,57 +796,6 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (blobs.empty())
return false;
int numOutput = blobs[0].size[0];
float inputScale = scales[0][0], outputScale = scales[1][0];
int inputZp = zeropoints[0][0];
Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
Mat biasQuantized(1, numOutput, CV_32S);
Mat outputMultiplier(1, numOutput, CV_32F);
bool perChannel = params.get<bool>("per_channel", true);
if (perChannel) // per-Channel quantization.
{
for (int i = 0; i < numOutput; i++)
{
double weightsScale = getWeightScale(weightsMat.row(i));
weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
float biasScale = inputScale * weightsScale;
biasQuantized.at<int>(i) = cvRound(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
outputMultiplier.at<float>(i) = biasScale / outputScale;
}
}
else // per-Tensor quantization.
{
double weightsScale = getWeightScale(weightsMat);
weightsMat.convertTo(weightsQuantized, CV_8S, 1.f/weightsScale);
float biasScale = inputScale * weightsScale;
for (int i = 0; i < numOutput; i++)
{
biasQuantized.at<int>(i) = cvRound(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
outputMultiplier.at<float>(i) = biasScale / outputScale;
}
}
params.blobs.clear();
params.set("per_channel", perChannel);
params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
params.blobs.push_back(biasQuantized);
params.blobs.push_back(outputMultiplier);
params.set("input_scale", inputScale);
params.set("input_zeropoint", inputZp);
return true;
}
#ifdef HAVE_WEBNN
virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{

@@ -880,12 +880,6 @@ public:
}
#endif // HAVE_CANN
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return false;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@@ -257,16 +257,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
float outputScale = scales[1][0];
int outputZp = zeropoints[1][0];
float padValue = outputZp + std::round(params.get<float>("value", 0)/outputScale);
params.set("value", padValue);
return true;
}
private:
std::vector<std::pair<int, int> > paddings; // Pairs pad before, pad after.
std::vector<Range> dstRanges;

@@ -587,12 +587,6 @@ public:
}
#endif // HAVE_TIMVX
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
// convert OpenCV NCHW order to WHCN order.
bool getOrderWHCN(std::vector<uint32_t>& orderWHCN)
{

@@ -1259,23 +1259,6 @@ public:
return true;
}
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (type == MAX && !computeMaxIdx)
{
return true;
}
else if (type == AVE || type == SUM)
{
float multiplier = scales[0][0] / scales[1][0];
params.set("multiplier", multiplier);
params.set("input_zeropoint", zeropoints[0][0]);
return true;
}
return false;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@@ -223,12 +223,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@@ -493,12 +493,6 @@ public:
return Ptr<BackendNode>();
}
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
private:
int axis;
int numAxes;

@@ -490,12 +490,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
protected:
int outWidth, outHeight;
const float zoomFactorWidth, zoomFactorHeight;

@@ -365,14 +365,6 @@ public:
shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat();
}
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@@ -147,12 +147,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
private:
Ptr<PermuteLayer> permute;
std::vector<int> permuteInpShape, permuteOutShape;

@@ -805,18 +805,6 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
const int numOutputs = scales[1].size();
for (int i = 0; i < numOutputs; i++)
{
if (scales[1][i] != scales[0][0])
return false;
}
return true;
}
private:
template <typename T>
void getSliceRecursive(const Mat &inpMat, std::vector<int> &inpIdx,

@@ -294,24 +294,6 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
float inpScale = scales[0][0];
Mat lookUpTable(1, 256, CV_32F);
float* table = lookUpTable.ptr<float>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - 127); // ensures exp(x) is always between (0, 1)
table[i+128] = std::exp(x);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
params.set("input_scale", inpScale);
params.set("input_zeropoint", zeropoints[0][0]);
return true;
}
#ifdef HAVE_WEBNN
virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
{

@@ -116,18 +116,6 @@ public:
return make_cuda_node<cuda4dnn::SplitOp>(preferableTarget, std::move(context->stream));
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
const int numOutputs = scales[1].size();
for (int i = 0; i < numOutputs; i++)
{
if (scales[1][i] != scales[0][0])
return false;
}
return true;
}
};
Ptr<SplitLayer> SplitLayer::create(const LayerParams& params)

@@ -114,33 +114,6 @@ void Net::forward(std::vector<std::vector<Mat>>& outputBlobs,
return impl->forward(outputBlobs, outBlobNames);
}
// FIXIT drop from inference API
Net Net::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel)
{
CV_TRACE_FUNCTION();
CV_Assert(impl);
CV_Assert(!empty());
return impl->quantize(*this, calibData, inputsDtype, outputsDtype, perChannel);
}
// FIXIT drop from inference API
void Net::getInputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) const
{
CV_TRACE_FUNCTION();
CV_Assert(impl);
CV_Assert(!empty());
return impl->getInputDetails(scales, zeropoints);
}
// FIXIT drop from inference API
void Net::getOutputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) const
{
CV_TRACE_FUNCTION();
CV_Assert(impl);
CV_Assert(!empty());
return impl->getOutputDetails(scales, zeropoints);
}
void Net::setPreferableBackend(int backendId)
{
CV_TRACE_FUNCTION();

@@ -273,11 +273,6 @@ struct Net::Impl : public detail::NetImplBase
void dumpNetworkToFile() const;
// FIXIT drop from inference API
Net quantize(Net& net, InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel) /*const*/;
void getInputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/;
void getOutputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/;
}; // Net::Impl

@@ -1,304 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
#include "net_impl.hpp"
namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN
// FIXIT drop from inference API
static
void getQuantizationParams(const Mat& src, std::vector<float>& scales, std::vector<int>& zeropoints)
{
const int qmin = -128; // INT8_MIN
const int qmax = 127; // INT8_MAX
double rmin, rmax, sc, zp;
cv::minMaxIdx(src, &rmin, &rmax);
// 0 must be present in the range [rmin, rmax]
rmin = std::min(rmin, 0.0);
rmax = std::max(rmax, 0.0);
sc = (rmax == rmin) ? 1.0 : (rmax - rmin)/(qmax - qmin);
zp = qmin - (rmin/sc);
scales.push_back((float)sc);
zeropoints.push_back((int)std::round(zp));
}
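As a quick sanity check of the calibration formula above, here is a stand-alone replay on a toy range (the numbers are worked out by hand and purely illustrative; the real function runs the same computation on the min/max of each calibration blob):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Stand-alone replay of the calibration math above on a toy range.
int main()
{
    const int qmin = -128, qmax = 127;
    double rmin = -0.5, rmax = 2.0;          // observed calibration range (illustrative)
    rmin = std::min(rmin, 0.0);              // 0 must stay representable
    rmax = std::max(rmax, 0.0);
    double sc = (rmax == rmin) ? 1.0 : (rmax - rmin) / (qmax - qmin);   // 2.5 / 255 ~= 0.0098
    int zp = static_cast<int>(std::round(qmin - rmin / sc));            // -128 + 51 = -77
    std::printf("scale=%f zeropoint=%d\n", sc, zp);
    // A real value 1.0 quantizes to round(1.0/sc) + zp = 102 - 77 = 25,
    // and dequantizes back to sc * (25 - zp) = 1.0.
    return 0;
}
```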
// FIXIT drop from inference API
Net Net::Impl::quantize(Net& net, InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel)
{
// Net can be quantized only once.
if (netWasQuantized)
CV_Error(Error::StsBadArg, "Cannot quantize a quantized net");
CV_CheckType(inputsDtype, inputsDtype == CV_32F || inputsDtype == CV_8S, "Input depth should be CV_32F or CV_8S");
CV_CheckType(outputsDtype, outputsDtype == CV_32F || outputsDtype == CV_8S, "Output depth should be CV_32F or CV_8S");
bool originalFusion = fusion;
int prefBackend = preferableBackend;
int prefTarget = preferableTarget;
// Disable fusions and use CPU backend to quantize net
// FIXIT: we should not modify original network!
setPreferableBackend(net, DNN_BACKEND_OPENCV);
setPreferableTarget(DNN_TARGET_CPU);
enableFusion(false);
enableWinograd(false);
if (calibData.isMat())
{
setInput(calibData.getMat(), /*name=*/"", /*scalefactor=*/1.0, /*mean=*/Scalar());
}
else if (calibData.isMatVector())
{
std::vector<Mat> calibDataVec;
calibData.getMatVector(calibDataVec);
std::vector<String> inpNames = netInputLayer->outNames;
CV_CheckEQ(calibDataVec.size(), inpNames.size(), "Calibration data size should be equal to number of inputs");
for (int i = 0; i < calibDataVec.size(); i++)
setInput(calibDataVec[i], inpNames[i], /*scalefactor=*/1.0, /*mean=*/Scalar());
}
std::vector<String> outNames = getUnconnectedOutLayersNames();
std::vector<LayerPin> pins;
for (int i = 0; i < outNames.size(); i++)
pins.push_back(getPinByAlias(outNames[i]));
setUpNet(pins);
// Compute scales and zeropoints for all the layers
std::vector<std::vector<float> > scales;
std::vector<std::vector<int> > zeropoints;
for (Impl::MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); it++)
{
LayerData& ld = it->second;
if (!ld.skip)
{
Ptr<Layer> layer = ld.layerInstance;
std::vector<Mat> inps(ld.inputBlobs.size());
for (int i = 0; i < ld.inputBlobs.size(); ++i)
inps[i] = *ld.inputBlobs[i];
layer->forward(inps, ld.outputBlobs, ld.internals);
}
std::vector<float> sc;
std::vector<int> zp;
if (ld.type == "TanH")
{
sc.push_back(1.f/128);
zp.push_back(0);
}
else if (ld.type == "Sigmoid" || ld.type == "Softmax" || ld.type == "SoftMax")
{
if (ld.params.get<bool>("log_softmax", false))
{
sc.push_back(16.f/256);
zp.push_back(127);
}
else
{
sc.push_back(1.f/256);
zp.push_back(-128);
}
}
else if (ld.type == "Split" || ld.type == "Slice" || ld.type == "Crop")
{
std::vector<float> inp_sc; std::vector<int> inp_zp;
getQuantizationParams(*ld.inputBlobs[0], inp_sc, inp_zp);
sc.assign(ld.outputBlobs.size(), inp_sc[0]);
zp.assign(ld.outputBlobs.size(), inp_zp[0]);
}
else
{
for (int i = 0; i < ld.outputBlobs.size(); i++)
getQuantizationParams(ld.outputBlobs[i], sc, zp);
}
scales.push_back(sc);
zeropoints.push_back(zp);
}
// For some layers, the input and output scales/zeropoints must be equal so that rescaling of inputs
// is not needed during quantized inference. We start from the last layer and modify the layer's input scales/zeropoints
// TODO : Need a different approach. Current solution fails when 2 such layers have the same input layer
for (Impl::MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
{
LayerData& ld = it->second;
// Layers with multiple outputs. Number of outputs is equal to number of inputs
if (ld.type == "Blank" || ld.type == "Dropout" || ld.type == "Identity" || ld.type == "Silence" ||
ld.type == "Flatten" || ld.type == "Padding" || ld.type == "Permute" || ld.type == "Reshape" ||
ld.type == "ReLU6" || ld.type == "Reorg" || ld.type == "ShuffleChannel" || ld.type == "Resize" ||
(ld.type == "ReLU" && !ld.params.get<float>("negative_slope", 0.f)) || /* ReLU with negative slope 0 */
(ld.type == "Reduce" && (toLowerCase(ld.params.get<String>("reduce")) == "max" ||
toLowerCase(ld.params.get<String>("reduce")) == "min")))
{
for (int i = 0; i < ld.outputBlobs.size(); i++)
{
LayerPin &pin = ld.inputBlobsId[i];
scales[pin.lid][pin.oid] = scales[ld.id][i];
zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][i];
}
}
// Layers with multiple inputs and single output.
else if ((ld.type == "Pooling" && toLowerCase(ld.params.get<String>("pool", "max")) == "max") /* Max Pooling */ ||
(ld.type == "Eltwise" && toLowerCase(ld.params.get<String>("operation", "sum")) == "max") /* Elementwise max */ ||
ld.type == "Concat")
{
for (int i = 0; i < ld.inputBlobsId.size(); i++)
{
LayerPin &pin = ld.inputBlobsId[i];
scales[pin.lid][pin.oid] = scales[ld.id][0];
zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][0];
}
}
}
// Create a new Net and add quantized layers to it.
Net dstNet_;
Net::Impl& dstNet = *(dstNet_.impl);
dstNet.netWasQuantized = true;
dstNet.setInputsNames(netInputLayer->outNames);
dstNet.setPreferableBackend(dstNet_, prefBackend);
dstNet.setPreferableTarget(prefTarget);
dstNet.enableFusion(originalFusion);
for (Impl::MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); it++)
{
LayerData ld = it->second;
if (ld.id == 0)
{
LayerData &quantInpLd = dstNet.layers[0];
quantInpLd.dtype = inputsDtype;
quantInpLd.params.set("scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
quantInpLd.params.set("zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
continue;
}
std::vector<LayerPin> inpPins = ld.inputBlobsId;
// Fill input and output scales/zeropoints for the layer
std::vector<std::vector<float> > inp_out_sc(2);
std::vector<std::vector<int> > inp_out_zp(2);
for (int i = 0; i < inpPins.size(); i++)
{
LayerPin &pin = inpPins[i];
inp_out_sc[0].push_back(scales[pin.lid][pin.oid]);
inp_out_zp[0].push_back(zeropoints[pin.lid][pin.oid]);
}
inp_out_sc[1] = scales[ld.id];
inp_out_zp[1] = zeropoints[ld.id];
// Set the quantization granularity: per-tensor or per-channel.
// This mainly affects the Convolution and Fully Connected layers.
ld.params.set("per_channel", perChannel);
// Quantize layer
Ptr<Layer> layer = ld.layerInstance;
if (layer->tryQuantize(inp_out_sc, inp_out_zp, ld.params))
{
ld.type += "Int8";
ld.dtype = CV_8S;
}
ld.params.set("scales", DictValue::arrayReal(inp_out_sc[1].data(), inp_out_sc[1].size()));
ld.params.set("zeropoints", DictValue::arrayInt(inp_out_zp[1].data(), inp_out_zp[1].size()));
// Check and add quantize/dequantize node before layer
for (int i = 0; i < inpPins.size(); i++)
{
LayerPin &pin = inpPins[i];
LayerData &inpLd = dstNet.getLayerData(getLayerName(pin.lid));
pin.lid = inpLd.id;
if (inpLd.dtype != ld.dtype)
{
String layerName = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? cv::format("quantize/%s/%d", inpLd.name.c_str(), pin.oid)
: cv::format("dequantize/%s/%d", inpLd.name.c_str(), pin.oid);
// Check if quantize/dequantize node for the input layer already exists
if (dstNet.getLayerId(layerName) >= 0)
{
pin.lid = dstNet.getLayerId(layerName);
pin.oid = 0;
}
else
{
LayerParams lp;
lp.set("scales", inp_out_sc[0][i]);
lp.set("zeropoints", inp_out_zp[0][i]);
lp.name = layerName;
lp.type = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? "Quantize" : "Dequantize";
int newLid = dstNet.addLayer(lp.name, lp.type, ld.dtype, lp);
dstNet.connect(pin.lid, pin.oid, newLid, 0);
pin.lid = newLid; pin.oid = 0;
}
}
}
// Add quantized layer to Net and connect to its inputs.
int newLid = dstNet.addLayer(ld.name, ld.type, ld.dtype, ld.params);
for( int i = 0; i < inpPins.size(); i++ )
dstNet.connect(inpPins[i].lid, inpPins[i].oid, newLid, i);
// If the layer is an output layer, add a quantize/dequantize node after it based on the output's data type.
if (ld.requiredOutputs.size() == 0 && ld.dtype != outputsDtype)
{
LayerParams lp;
lp.set("scales", inp_out_sc[1][0]);
lp.set("zeropoints", inp_out_zp[1][0]);
lp.name = ((ld.dtype == CV_32F && outputsDtype == CV_8S) ? "quantize/" : "dequantize/") + ld.name;
lp.type = (ld.dtype == CV_32F && outputsDtype == CV_8S) ? "Quantize" : "Dequantize";
dstNet.addLayerToPrev(lp.name, lp.type, outputsDtype, lp);
}
}
// Restore FP32 Net's backend, target and fusion
setPreferableBackend(net, prefBackend);
setPreferableTarget(prefTarget);
enableFusion(originalFusion);
return dstNet_;
}
// FIXIT drop from inference API
void Net::Impl::getInputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/
{
if (!netWasQuantized)
CV_Error(Error::StsBadFunc, "Net isn't quantized");
LayerParams &lp = layers[0].params;
DictValue sc = lp.get("scales");
DictValue zp = lp.get("zeropoints");
for (int i = 0; i < sc.size(); i++)
{
scales.push_back(sc.get<float>(i));
zeropoints.push_back(zp.get<int>(i));
}
}
// FIXIT drop from inference API
void Net::Impl::getOutputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/
{
if (!netWasQuantized)
CV_Error(Error::StsBadFunc, "Net isn't quantized");
std::vector<int> outLayerIds = getUnconnectedOutLayers();
for (auto &lid : outLayerIds)
{
LayerParams &lp = layers[lid].params;
DictValue sc = lp.get("scales");
DictValue zp = lp.get("zeropoints");
for (int i = 0; i < sc.size(); i++)
{
scales.push_back(sc.get<float>(i));
zeropoints.push_back(zp.get<int>(i));
}
}
}
CV__DNN_INLINE_NS_END
}} // namespace cv::dnn

@@ -2,6 +2,10 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// The tests are disabled because on-the-fly quantization was removed in https://github.com/opencv/opencv/pull/24980
// To be restored when test models are quantized outside of OpenCV
#if 0
#include "test_precomp.hpp"
#include "npy_blob.hpp"
#include <opencv2/dnn/shape_utils.hpp>
@@ -1389,3 +1393,5 @@ TEST_P(Test_Int8_nets, YOLOv4_tiny)
INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_nets, dnnBackendsAndTargetsInt8());
}} // namespace
#endif // #if 0
