mirror of https://github.com/opencv/opencv.git
Merge pull request #24980 from fengyuentau:on-fly-quantization-removal
dnn cleanup: on-the-fly quantization removal #24980

On-the-fly quantization was first introduced in https://github.com/opencv/opencv/pull/20228. We decided to remove it, while keeping the int8 layer implementations, because on-the-fly quantization is less practical given that there are now so many dedicated tools for model quantization.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch.
- [x] There is a reference to the original bug report and related work.
- [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake.
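For reference, the API being removed is `Net::quantize`, shown in full in the diff below. A minimal sketch of the old workflow (the model file and calibration blob are hypothetical placeholders, and the exact default of `perChannel` varied across 4.x releases):

```cpp
// Sketch of the removed on-the-fly quantization workflow (OpenCV 4.x).
#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>

int main()
{
    // Hypothetical FP32 model.
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");

    // A representative, preprocessed calibration batch (NCHW) used to
    // estimate per-layer activation ranges.
    cv::Mat calib(std::vector<int>{1, 3, 224, 224}, CV_32F);
    cv::randu(calib, -1.0f, 1.0f);

    // Quantize weights/activations to int8, keeping FP32 inputs/outputs.
    cv::dnn::Net qnet = net.quantize(calib, CV_32F, CV_32F, /*perChannel=*/false);
    return 0;
}
```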
parent 3125f9708d
commit d4fd5157fa
27 changed files with 6 additions and 728 deletions
@@ -1,304 +0,0 @@
```cpp
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "precomp.hpp"

#include "net_impl.hpp"

namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN


// FIXIT drop from inference API
static
void getQuantizationParams(const Mat& src, std::vector<float>& scales, std::vector<int>& zeropoints)
{
    const int qmin = -128; // INT8_MIN
    const int qmax = 127;  // INT8_MAX

    double rmin, rmax, sc, zp;
    cv::minMaxIdx(src, &rmin, &rmax);

    // 0 must be present in the range [rmin, rmax]
    rmin = std::min(rmin, 0.0);
    rmax = std::max(rmax, 0.0);

    sc = (rmax == rmin) ? 1.0 : (rmax - rmin)/(qmax - qmin);
    zp = qmin - (rmin/sc);

    scales.push_back((float)sc);
    zeropoints.push_back((int)std::round(zp));
}
```
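`getQuantizationParams` implements plain asymmetric (affine) range-based quantization: scale = (rmax − rmin) / (qmax − qmin) and zeropoint = qmin − rmin/scale, with the real value 0.0 forced into [rmin, rmax] so that it maps exactly onto an int8 code. A standalone restatement of the same math (a sketch, independent of the dnn internals):

```cpp
// Affine quantization parameters over an observed value range.
#include <opencv2/core.hpp>
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    cv::Mat src = (cv::Mat_<float>(1, 4) << -0.5f, 0.f, 1.f, 2.5f);

    double rmin, rmax;
    cv::minMaxIdx(src, &rmin, &rmax);
    rmin = std::min(rmin, 0.0);  // keep 0.0 inside the range
    rmax = std::max(rmax, 0.0);

    const int qmin = -128, qmax = 127;
    double sc = (rmax == rmin) ? 1.0 : (rmax - rmin) / (qmax - qmin);
    int zp = (int)std::round(qmin - rmin / sc);

    std::printf("scale=%g zeropoint=%d\n", sc, zp);
    return 0;
}
```

The main entry point, `Net::Impl::quantize`, follows. It runs the calibration data through the FP32 net and records scale/zeropoint pairs for every layer output: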
```cpp
// FIXIT drop from inference API
Net Net::Impl::quantize(Net& net, InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel)
{
    // Net can be quantized only once.
    if (netWasQuantized)
        CV_Error(Error::StsBadArg, "Cannot quantize a quantized net");

    CV_CheckType(inputsDtype, inputsDtype == CV_32F || inputsDtype == CV_8S, "Input depth should be CV_32F or CV_8S");
    CV_CheckType(outputsDtype, outputsDtype == CV_32F || outputsDtype == CV_8S, "Output depth should be CV_32F or CV_8S");

    bool originalFusion = fusion;
    int prefBackend = preferableBackend;
    int prefTarget = preferableTarget;

    // Disable fusions and use CPU backend to quantize net
    // FIXIT: we should not modify original network!
    setPreferableBackend(net, DNN_BACKEND_OPENCV);
    setPreferableTarget(DNN_TARGET_CPU);
    enableFusion(false);
    enableWinograd(false);

    if (calibData.isMat())
    {
        setInput(calibData.getMat(), /*name=*/"", /*scalefactor=*/1.0, /*mean=*/Scalar());
    }
    else if (calibData.isMatVector())
    {
        std::vector<Mat> calibDataVec;
        calibData.getMatVector(calibDataVec);

        std::vector<String> inpNames = netInputLayer->outNames;
        CV_CheckEQ(calibDataVec.size(), inpNames.size(), "Calibration data size should be equal to number of inputs");
        for (int i = 0; i < calibDataVec.size(); i++)
            setInput(calibDataVec[i], inpNames[i], /*scalefactor=*/1.0, /*mean=*/Scalar());
    }

    std::vector<String> outNames = getUnconnectedOutLayersNames();
    std::vector<LayerPin> pins;
    for (int i = 0; i < outNames.size(); i++)
        pins.push_back(getPinByAlias(outNames[i]));
    setUpNet(pins);

    // Compute scales and zeropoints for all the layers
    std::vector<std::vector<float> > scales;
    std::vector<std::vector<int> > zeropoints;
    for (Impl::MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); it++)
    {
        LayerData& ld = it->second;
        if (!ld.skip)
        {
            Ptr<Layer> layer = ld.layerInstance;
            std::vector<Mat> inps(ld.inputBlobs.size());
            for (int i = 0; i < ld.inputBlobs.size(); ++i)
                inps[i] = *ld.inputBlobs[i];
            layer->forward(inps, ld.outputBlobs, ld.internals);
        }

        std::vector<float> sc;
        std::vector<int> zp;
        if (ld.type == "TanH")
        {
            sc.push_back(1.f/128);
            zp.push_back(0);
        }
        else if (ld.type == "Sigmoid" || ld.type == "Softmax" || ld.type == "SoftMax")
        {
            if (ld.params.get<bool>("log_softmax", false))
            {
                sc.push_back(16.f/256);
                zp.push_back(127);
            }
            else
            {
                sc.push_back(1.f/256);
                zp.push_back(-128);
            }
        }
        else if (ld.type == "Split" || ld.type == "Slice" || ld.type == "Crop")
        {
            std::vector<float> inp_sc; std::vector<int> inp_zp;
            getQuantizationParams(*ld.inputBlobs[0], inp_sc, inp_zp);
            sc.assign(ld.outputBlobs.size(), inp_sc[0]);
            zp.assign(ld.outputBlobs.size(), inp_zp[0]);
        }
        else
        {
            for (int i = 0; i < ld.outputBlobs.size(); i++)
                getQuantizationParams(ld.outputBlobs[i], sc, zp);
        }
        scales.push_back(sc);
        zeropoints.push_back(zp);
    }
```
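Note how activations with a statically known output range bypass calibration entirely: tanh outputs lie in [−1, 1], so scale = (1 − (−1)) / (127 − (−128)) ≈ 1/128 with zeropoint 0, while sigmoid/softmax outputs lie in [0, 1], giving scale 1/256 with zeropoint −128 so the full int8 range covers [0, 1]. The function continues with a backward pass that propagates quantization parameters: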
```cpp
    // For some layers, the input and output scales/zeropoints must be equal so that rescaling of inputs
    // is not needed during quantized inference. We start from the last layer and modify the layer's input scales/zeropoints
    // TODO : Need a different approach. Current solution fails when 2 such layers have the same input layer
    for (Impl::MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
    {
        LayerData& ld = it->second;
        // Layers with multiple outputs. Number of outputs is equal to number of inputs
        if (ld.type == "Blank" || ld.type == "Dropout" || ld.type == "Identity" || ld.type == "Silence" ||
            ld.type == "Flatten" || ld.type == "Padding" || ld.type == "Permute" || ld.type == "Reshape" ||
            ld.type == "ReLU6" || ld.type == "Reorg" || ld.type == "ShuffleChannel" || ld.type == "Resize" ||
            (ld.type == "ReLU" && !ld.params.get<float>("negative_slope", 0.f)) || /* ReLU with negative slope 0 */
            (ld.type == "Reduce" && (toLowerCase(ld.params.get<String>("reduce")) == "max" ||
                                     toLowerCase(ld.params.get<String>("reduce")) == "min")))
        {
            for (int i = 0; i < ld.outputBlobs.size(); i++)
            {
                LayerPin &pin = ld.inputBlobsId[i];
                scales[pin.lid][pin.oid] = scales[ld.id][i];
                zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][i];
            }
        }
        // Layers with multiple inputs and single output.
        else if ((ld.type == "Pooling" && toLowerCase(ld.params.get<String>("pool", "max")) == "max") /* Max Pooling */ ||
                 (ld.type == "Eltwise" && toLowerCase(ld.params.get<String>("operation", "sum")) == "max") /* Elementwise max */ ||
                 ld.type == "Concat")
        {
            for (int i = 0; i < ld.inputBlobsId.size(); i++)
            {
                LayerPin &pin = ld.inputBlobsId[i];
                scales[pin.lid][pin.oid] = scales[ld.id][0];
                zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][0];
            }
        }
    }
```
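Max-pooling, element-wise max, Concat, and the various shape/identity layers only select or rearrange input values, so forcing their producers to share the consumer's scale/zeropoint avoids inserting requantization nodes around them. As the TODO above notes, this backward overwrite is fragile when two such consumers share one producer: the last write wins. The final stage rebuilds the graph as a new quantized `Net`: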
```cpp
    // Create a new Net and add quantized layers to it.
    Net dstNet_;
    Net::Impl& dstNet = *(dstNet_.impl);
    dstNet.netWasQuantized = true;
    dstNet.setInputsNames(netInputLayer->outNames);
    dstNet.setPreferableBackend(dstNet_, prefBackend);
    dstNet.setPreferableTarget(prefTarget);
    dstNet.enableFusion(originalFusion);

    for (Impl::MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); it++)
    {
        LayerData ld = it->second;
        if (ld.id == 0)
        {
            LayerData &quantInpLd = dstNet.layers[0];
            quantInpLd.dtype = inputsDtype;
            quantInpLd.params.set("scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
            quantInpLd.params.set("zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
            continue;
        }

        std::vector<LayerPin> inpPins = ld.inputBlobsId;
        // Fill input and output scales/zeropoints for the layer
        std::vector<std::vector<float> > inp_out_sc(2);
        std::vector<std::vector<int> > inp_out_zp(2);
        for (int i = 0; i < inpPins.size(); i++)
        {
            LayerPin &pin = inpPins[i];
            inp_out_sc[0].push_back(scales[pin.lid][pin.oid]);
            inp_out_zp[0].push_back(zeropoints[pin.lid][pin.oid]);
        }
        inp_out_sc[1] = scales[ld.id];
        inp_out_zp[1] = zeropoints[ld.id];

        // Set the quantization type: per-tensor or per-channel.
        // Relevant especially for convolution and fully connected layers.
        ld.params.set("per_channel", perChannel);

        // Quantize layer
        Ptr<Layer> layer = ld.layerInstance;
        if (layer->tryQuantize(inp_out_sc, inp_out_zp, ld.params))
        {
            ld.type += "Int8";
            ld.dtype = CV_8S;
        }
        ld.params.set("scales", DictValue::arrayReal(inp_out_sc[1].data(), inp_out_sc[1].size()));
        ld.params.set("zeropoints", DictValue::arrayInt(inp_out_zp[1].data(), inp_out_zp[1].size()));

        // Check and add quantize/dequantize node before layer
        for (int i = 0; i < inpPins.size(); i++)
        {
            LayerPin &pin = inpPins[i];
            LayerData &inpLd = dstNet.getLayerData(getLayerName(pin.lid));
            pin.lid = inpLd.id;
            if (inpLd.dtype != ld.dtype)
            {
                String layerName = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? cv::format("quantize/%s/%d", inpLd.name.c_str(), pin.oid)
                                                                                : cv::format("dequantize/%s/%d", inpLd.name.c_str(), pin.oid);
                // Check if quantize/dequantize node for the input layer already exists
                if (dstNet.getLayerId(layerName) >= 0)
                {
                    pin.lid = dstNet.getLayerId(layerName);
                    pin.oid = 0;
                }
                else
                {
                    LayerParams lp;
                    lp.set("scales", inp_out_sc[0][i]);
                    lp.set("zeropoints", inp_out_zp[0][i]);
                    lp.name = layerName;
                    lp.type = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? "Quantize" : "Dequantize";
                    int newLid = dstNet.addLayer(lp.name, lp.type, ld.dtype, lp);
                    dstNet.connect(pin.lid, pin.oid, newLid, 0);
                    pin.lid = newLid; pin.oid = 0;
                }
            }
        }

        // Add quantized layer to Net and connect to its inputs.
        int newLid = dstNet.addLayer(ld.name, ld.type, ld.dtype, ld.params);
        for (int i = 0; i < inpPins.size(); i++)
            dstNet.connect(inpPins[i].lid, inpPins[i].oid, newLid, i);

        // If the layer is an output layer, add a quantize/dequantize node after it based on the output's data type.
        if (ld.requiredOutputs.size() == 0 && ld.dtype != outputsDtype)
        {
            LayerParams lp;
            lp.set("scales", inp_out_sc[1][0]);
            lp.set("zeropoints", inp_out_zp[1][0]);
            lp.name = ((ld.dtype == CV_32F && outputsDtype == CV_8S) ? "quantize/" : "dequantize/") + ld.name;
            lp.type = (ld.dtype == CV_32F && outputsDtype == CV_8S) ? "Quantize" : "Dequantize";
            dstNet.addLayerToPrev(lp.name, lp.type, outputsDtype, lp);
        }
    }
    // Restore FP32 Net's backend, target and fusion
    setPreferableBackend(net, prefBackend);
    setPreferableTarget(prefTarget);
    enableFusion(originalFusion);
    return dstNet_;
}
```
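At every graph edge where producer and consumer dtypes disagree, the code splices in a `Quantize` or `Dequantize` node. The per-element transform behind those nodes is the usual affine pair, sketched below for illustration (this is not the dnn kernel itself):

```cpp
// Affine int8 quantize/dequantize round trip (illustrative sketch).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static int8_t quantizeValue(float x, float scale, int zeropoint)
{
    int q = (int)std::round(x / scale) + zeropoint;
    return (int8_t)std::min(127, std::max(-128, q));  // clamp to int8
}

static float dequantizeValue(int8_t q, float scale, int zeropoint)
{
    return scale * ((int)q - zeropoint);
}

int main()
{
    const float scale = 0.02f;
    const int zeropoint = 3;
    const float x = 0.5f;
    int8_t q = quantizeValue(x, scale, zeropoint);
    std::printf("x=%g -> q=%d -> x'=%g\n", x, q, dequantizeValue(q, scale, zeropoint));
    return 0;
}
```

The last two helpers expose the chosen input/output parameters of a quantized net: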
```cpp
// FIXIT drop from inference API
void Net::Impl::getInputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/
{
    if (!netWasQuantized)
        CV_Error(Error::StsBadFunc, "Net isn't quantized");

    LayerParams &lp = layers[0].params;
    DictValue sc = lp.get("scales");
    DictValue zp = lp.get("zeropoints");

    for (int i = 0; i < sc.size(); i++)
    {
        scales.push_back(sc.get<float>(i));
        zeropoints.push_back(zp.get<int>(i));
    }
}

// FIXIT drop from inference API
void Net::Impl::getOutputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) /*const*/
{
    if (!netWasQuantized)
        CV_Error(Error::StsBadFunc, "Net isn't quantized");

    std::vector<int> outLayerIds = getUnconnectedOutLayers();
    for (auto &lid : outLayerIds)
    {
        LayerParams &lp = layers[lid].params;
        DictValue sc = lp.get("scales");
        DictValue zp = lp.get("zeropoints");

        for (int i = 0; i < sc.size(); i++)
        {
            scales.push_back(sc.get<float>(i));
            zeropoints.push_back(zp.get<int>(i));
        }
    }
}


CV__DNN_INLINE_NS_END
}} // namespace cv::dnn
```
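These implementations backed the public `Net::getInputDetails` and `Net::getOutputDetails` accessors. A usage sketch (assuming `qnet` was produced by the `Net::quantize` call shown earlier):

```cpp
// Query the scales/zeropoints a quantized net expects at its boundary.
#include <opencv2/dnn.hpp>
#include <cstdio>

void printQuantizationDetails(cv::dnn::Net& qnet)  // qnet: a quantized net
{
    std::vector<float> inScales, outScales;
    std::vector<int> inZeropoints, outZeropoints;
    qnet.getInputDetails(inScales, inZeropoints);
    qnet.getOutputDetails(outScales, outZeropoints);
    for (size_t i = 0; i < inScales.size(); i++)
        std::printf("input %zu: scale=%g zeropoint=%d\n", i, inScales[i], inZeropoints[i]);
    for (size_t i = 0; i < outScales.size(); i++)
        std::printf("output %zu: scale=%g zeropoint=%d\n", i, outScales[i], outZeropoints[i]);
}
```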