diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index 70f9a5a73e..c1236c4653 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -9,6 +9,7 @@ endif() set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass") ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV) +ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js) diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp index 794bfeedda..fbe16850d4 100644 --- a/modules/dnn/include/opencv2/dnn/all_layers.hpp +++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp @@ -258,6 +258,14 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer + { + public: + int input_zp, output_zp; + float output_sc; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer { public: @@ -300,6 +308,13 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS PoolingLayerInt8 : public PoolingLayer + { + public: + int input_zp, output_zp; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS SoftmaxLayer : public Layer { public: @@ -308,6 +323,14 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS SoftmaxLayerInt8 : public SoftmaxLayer + { + public: + float output_sc; + int output_zp; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS InnerProductLayer : public Layer { public: @@ -315,6 +338,13 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS InnerProductLayerInt8 : public InnerProductLayer + { + public: + int output_zp; + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS MVNLayer : public Layer { public: @@ -341,6 +371,22 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS QuantizeLayer : public Layer + { + public: + float scale; + int zeropoint; + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS DequantizeLayer : public Layer + { + public: + float scale; + int zeropoint; + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS ConcatLayer : public Layer { public: @@ -352,6 +398,7 @@ CV__DNN_INLINE_NS_BEGIN * Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat */ bool padding; + int paddingValue; static Ptr create(const LayerParams ¶ms); }; @@ -459,7 +506,11 @@ CV__DNN_INLINE_NS_BEGIN { public: virtual void forwardSlice(const float* src, float* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const = 0; + size_t outPlaneSize, int cn0, int cn1) const {}; + virtual void forwardSlice(const int* src, const int* lut, int* dst, int len, + size_t outPlaneSize, int cn0, int cn1) const {}; + virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, + size_t outPlaneSize, int cn0, int cn1) const {}; }; class CV_EXPORTS ReLULayer : public ActivationLayer @@ -542,6 +593,12 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + /* Layers used in semantic segmentation */ class CV_EXPORTS 
CropLayer : public Layer @@ -563,6 +620,12 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS EltwiseLayerInt8 : public Layer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS BatchNormLayer : public ActivationLayer { public: @@ -572,6 +635,14 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS BatchNormLayerInt8 : public BatchNormLayer + { + public: + float input_sc, output_sc; + int input_zp, output_zp; + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS MaxUnpoolLayer : public Layer { public: @@ -591,12 +662,26 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS ScaleLayerInt8 : public ScaleLayer + { + public: + float output_sc; + int output_zp; + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS ShiftLayer : public Layer { public: static Ptr create(const LayerParams& params); }; + class CV_EXPORTS ShiftLayerInt8 : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS DataAugmentationLayer : public Layer { public: diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index a498039f65..bf1670051a 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -235,6 +235,15 @@ CV__DNN_INLINE_NS_BEGIN */ virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals); + /** @brief Tries to quantize the given layer and compute the quantization parameters required for fixed point implementation. + * @param[in] scales input and output scales. + * @param[in] zeropoints input and output zeropoints. + * @param[out] params Quantized parameters required for fixed point implementation of that layer. + * @returns True if layer can be quantized. + */ + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params); + /** @brief Given the @p input blobs, computes the output @p blobs. * @param[in] inputs the input blobs. * @param[out] outputs allocated output blobs, which will store results of the computation. @@ -368,6 +377,16 @@ CV__DNN_INLINE_NS_BEGIN */ virtual void getScaleShift(Mat& scale, Mat& shift) const; + /** + * @brief Returns scale and zeropoint of layers + * @param[out] scale Output scale + * @param[out] zeropoint Output zeropoint + * + * By default, @p scale is 1 and @p zeropoint is 0. + */ + virtual void getScaleZeropoint(float& scale, int& zeropoint) const; + + /** * @brief "Deattaches" all the layers, attached to particular layer. */ @@ -453,13 +472,21 @@ CV__DNN_INLINE_NS_BEGIN /** @brief Adds new layer to the net. * @param name unique name of the adding layer. * @param type typename of the adding layer (type must be registered in LayerRegister). + * @param dtype datatype of output blobs. * @param params parameters which will be used to initialize the creating layer. * @returns unique identifier of created layer, or -1 if a failure will happen. */ + int addLayer(const String &name, const String &type, const int &dtype, LayerParams ¶ms); + + /** @overload Datatype of output blobs set to default CV_32F */ int addLayer(const String &name, const String &type, LayerParams ¶ms); + /** @brief Adds new layer and connects its first input to the first output of previously added layer. 
* @see addLayer() */ + int addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams ¶ms); + + /** @overload */ int addLayerToPrev(const String &name, const String &type, LayerParams ¶ms); /** @brief Converts string name of the layer to the integer identifier. @@ -551,6 +578,25 @@ CV__DNN_INLINE_NS_BEGIN CV_WRAP_AS(forwardAndRetrieve) void forward(CV_OUT std::vector >& outputBlobs, const std::vector& outBlobNames); + /** @brief Returns a quantized Net from a floating-point Net. + * @param calibData Calibration data to compute the quantization parameters. + * @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S. + * @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S. + */ + CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype); + + /** @brief Returns input scale and zeropoint for a quantized Net. + * @param scales output parameter for returning input scales. + * @param zeropoints output parameter for returning input zeropoints. + */ + CV_WRAP void getInputDetails(CV_OUT std::vector& scales, CV_OUT std::vector& zeropoints) const; + + /** @brief Returns output scale and zeropoint for a quantized Net. + * @param scales output parameter for returning output scales. + * @param zeropoints output parameter for returning output zeropoints. + */ + CV_WRAP void getOutputDetails(CV_OUT std::vector& scales, CV_OUT std::vector& zeropoints) const; + /** * @brief Compile Halide layers. * @param[in] scheduler Path to YAML file with scheduling directives. diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 52a5fcba28..492ad166d0 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -574,9 +574,9 @@ struct LayerPin struct LayerData { - LayerData() : id(-1), skip(false), flag(0) {} - LayerData(int _id, const String &_name, const String &_type, LayerParams &_params) - : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0) + LayerData() : id(-1), dtype(CV_32F), skip(false), flag(0) {} + LayerData(int _id, const String &_name, const String &_type, const int &_dtype, LayerParams &_params) + : id(_id), name(_name), type(_type), dtype(_dtype), params(_params), skip(false), flag(0) { CV_TRACE_FUNCTION(); @@ -588,6 +588,7 @@ struct LayerData int id; String name; String type; + int dtype; // Datatype of output blobs. LayerParams params; std::vector inputBlobsId; @@ -944,7 +945,7 @@ public: } } - void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half) + void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, const int& dtype) { if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS) { @@ -966,7 +967,8 @@ public: { Mat& unusedBlob = hostIt->second; if (unusedBlob.total() >= targetTotal && - unusedBlob.total() < bestBlobTotal) + unusedBlob.total() < bestBlobTotal && + unusedBlob.type() == dtype) { bestBlobPin = hostIt->first; bestBlob = unusedBlob; @@ -985,14 +987,13 @@ public: { // if dst already has been allocated with total(shape) elements, // it won't be recreated and pointer of dst.data remains the same. - dst.create(shape, use_half ? 
CV_16S : CV_32F); + dst.create(shape, dtype); addHost(lp, dst); } } void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes, - std::vector& pinsForInternalBlobs, - bool use_half = false) + std::vector& pinsForInternalBlobs) { CV_TRACE_FUNCTION(); @@ -1063,7 +1064,7 @@ public: reuse(ld.inputBlobsId[0], blobPin); } else - reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half); + reuseOrCreate(shapes[index], blobPin, *blobs[index], ld.dtype); } } } @@ -1193,6 +1194,7 @@ struct Net::Impl : public detail::NetImplBase lastLayerId = 0; netWasAllocated = false; + netWasQuantized = false; fusion = true; isAsync = false; preferableBackend = DNN_BACKEND_DEFAULT; @@ -1217,6 +1219,7 @@ struct Net::Impl : public detail::NetImplBase int lastLayerId; bool netWasAllocated; + bool netWasQuantized; bool fusion; bool isAsync; std::vector layersTimings; @@ -1372,7 +1375,7 @@ struct Net::Impl : public detail::NetImplBase currLayer->unsetAttached(); } - + netWasAllocated = false; layersTimings.clear(); } @@ -2541,10 +2544,11 @@ struct Net::Impl : public detail::NetImplBase CV_Assert(layerShapesIt != layersShapes.end()); + if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F) + ld.dtype = CV_16S; + std::vector pinsForInternalBlobs; - blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, - preferableBackend == DNN_BACKEND_OPENCV && - preferableTarget == DNN_TARGET_OPENCL_FP16); + blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs); ld.outputBlobsWrappers.resize(ld.outputBlobs.size()); for (int i = 0; i < ld.outputBlobs.size(); ++i) ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]); @@ -3148,7 +3152,8 @@ struct Net::Impl : public detail::NetImplBase Mat& inp = layers[0].outputBlobs[i]; CV_Assert(inp.total()); if (preferableBackend == DNN_BACKEND_OPENCV && - preferableTarget == DNN_TARGET_OPENCL_FP16) + preferableTarget == DNN_TARGET_OPENCL_FP16 && + layers[0].dtype == CV_32F) { layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S); } @@ -3458,6 +3463,25 @@ struct Net::Impl : public detail::NetImplBase #endif } + void getQuantizationParams(const Mat& src, std::vector& scales, std::vector& zeropoints) + { + const int qmin = -128; // INT8_MIN + const int qmax = 127; // INT8_MAX + + double rmin, rmax, sc, zp; + cv::minMaxIdx(src, &rmin, &rmax); + + // 0 must be present in the range [rmin, rmax] + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + + sc = (rmax == rmin) ? 
1.0 : (rmax - rmin)/(qmax - qmin); + zp = qmin - (rmin/sc); + + scales.push_back((float)sc); + zeropoints.push_back((int)std::round(zp)); + } + void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) { std::vector& inputLayerIds = layers[id].inputBlobsId; @@ -3588,7 +3612,8 @@ struct Net::Impl : public detail::NetImplBase Mat& inp = layers[0].outputBlobs[i]; CV_Assert(inp.total()); if (preferableBackend == DNN_BACKEND_OPENCV && - preferableTarget == DNN_TARGET_OPENCL_FP16) + preferableTarget == DNN_TARGET_OPENCL_FP16 && + layers[0].dtype == CV_32F) { layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S); } @@ -3614,7 +3639,7 @@ struct Net::Impl : public detail::NetImplBase const MatShape& shape = layersShapes[inputLayerId].out[inputLayerIds[i].oid]; layersShapes[layerId].in.push_back(shape); } - it->second.layerInstance->updateMemoryShapes(layersShapes[layerId].in); + it->second.getLayerInstance()->updateMemoryShapes(layersShapes[layerId].in); } } } @@ -4019,7 +4044,7 @@ Net::~Net() { } -int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) +int Net::addLayer(const String &name, const String &type, const int &dtype, LayerParams ¶ms) { CV_TRACE_FUNCTION(); @@ -4042,23 +4067,35 @@ int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) id = ++impl->lastLayerId; impl->layerNameToId.insert(std::make_pair(name, id)); - impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params))); + impl->layers.insert(std::make_pair(id, LayerData(id, name, type, dtype, params))); if (params.get("has_dynamic_shapes", false)) impl->hasDynamicShapes = true; return id; } -int Net::addLayerToPrev(const String &name, const String &type, LayerParams ¶ms) +int Net::addLayer(const String &name, const String &type, LayerParams ¶ms) +{ + CV_TRACE_FUNCTION(); + return addLayer(name, type, CV_32F, params); +} + +int Net::addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams ¶ms) { CV_TRACE_FUNCTION(); int prvLid = impl->lastLayerId; - int newLid = this->addLayer(name, type, params); + int newLid = this->addLayer(name, type, dtype, params); this->connect(prvLid, 0, newLid, 0); return newLid; } +int Net::addLayerToPrev(const String &name, const String &type, LayerParams ¶ms) +{ + CV_TRACE_FUNCTION(); + return addLayerToPrev(name, type, CV_32F, params); +} + void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum) { CV_TRACE_FUNCTION(); @@ -4169,16 +4206,19 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName) ld.outputBlobsWrappers[i]->copyToHost(); } } - if (ld.outputBlobs[0].depth() == CV_32F) + if (ld.outputBlobs[0].depth() == CV_16S) { - std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); - outputvec = ld.outputBlobs; - } else { std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); outputvec.resize(ld.outputBlobs.size()); for (int i = 0; i < outputvec.size(); i++) convertFp16(ld.outputBlobs[i], outputvec[i]); } + else + { + // Output depth can be CV_32F or CV_8S + std::vector & outputvec = *(std::vector *)outputBlobs.getObj(); + outputvec = ld.outputBlobs; + } } else if (outputBlobs.isUMatVector()) { @@ -4264,11 +4304,277 @@ void Net::forward(std::vector >& outputBlobs, } } +Net Net::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype) +{ + CV_TRACE_FUNCTION(); + + // Net can be quantized only once. 
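    // ------------------------------------------------------------------------
    // Illustrative use of the API added in this patch (sketch only, not part of
    // the patch itself; "model.onnx", the calibration image names and the
    // 224x224 blob size are placeholder assumptions):
    //
    //   Net netFP32 = readNet("model.onnx");
    //   std::vector<Mat> calibImages = { imread("calib_0.jpg"), imread("calib_1.jpg") };
    //   Mat calibBlob = blobFromImages(calibImages, 1.0/255, Size(224, 224));
    //
    //   // CV_8S keeps the network borders in int8; passing CV_32F instead makes
    //   // quantize() insert Quantize/Dequantize layers at the borders so the
    //   // caller keeps working with float blobs.
    //   Net netInt8 = netFP32.quantize(calibBlob, CV_8S, CV_8S);
    //
    //   std::vector<float> inSc, outSc;
    //   std::vector<int>   inZp, outZp;
    //   netInt8.getInputDetails(inSc, inZp);    // real_value = scale * (q - zeropoint)
    //   netInt8.getOutputDetails(outSc, outZp);
    // ------------------------------------------------------------------------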
+ if (impl->netWasQuantized) + CV_Error(Error::StsBadArg, "Cannot quantize a quantized net"); + + CV_CheckType(inputsDtype, inputsDtype == CV_32F || inputsDtype == CV_8S, "Input depth should be CV_32F or CV_8S"); + CV_CheckType(outputsDtype, outputsDtype == CV_32F || outputsDtype == CV_8S, "Output depth should be CV_32F or CV_8S"); + + bool originalFusion = impl->fusion; + int prefBackend = impl->preferableBackend; + int prefTarget = impl->preferableTarget; + + // Disable fusions and use CPU backend to quantize net + setPreferableBackend(DNN_BACKEND_OPENCV); + setPreferableTarget(DNN_TARGET_CPU); + enableFusion(false); + + if (calibData.isMat()) + { + setInput(calibData.getMat()); + } + else if (calibData.isMatVector()) + { + std::vector calibDataVec; + calibData.getMatVector(calibDataVec); + + std::vector inpNames = impl->netInputLayer->outNames; + CV_CheckEQ(calibDataVec.size(), inpNames.size(), "Calibration data size should be equal to number of inputs"); + for (int i = 0; i < calibDataVec.size(); i++) + setInput(calibDataVec[i], inpNames[i]); + } + + std::vector outNames = getUnconnectedOutLayersNames(); + std::vector pins; + for (int i = 0; i < outNames.size(); i++) + pins.push_back(impl->getPinByAlias(outNames[i])); + impl->setUpNet(pins); + + // Compute scales and zeropoints for all the layers + std::vector > scales; + std::vector > zeropoints; + for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++) + { + LayerData& ld = it->second; + if (!ld.skip) + { + Ptr layer = ld.layerInstance; + std::vector inps(ld.inputBlobs.size()); + for (int i = 0; i < ld.inputBlobs.size(); ++i) + inps[i] = *ld.inputBlobs[i]; + layer->forward(inps, ld.outputBlobs, ld.internals); + } + + std::vector sc; + std::vector zp; + if (ld.type == "TanH") + { + sc.push_back(1.f/128); + zp.push_back(0); + } + else if (ld.type == "Sigmoid" || ld.type == "Softmax" || ld.type == "SoftMax") + { + if (ld.params.get("log_softmax", false)) + { + sc.push_back(16.f/256); + zp.push_back(127); + } + else + { + sc.push_back(1.f/256); + zp.push_back(-128); + } + } + else if (ld.type == "Split" || ld.type == "Slice" || ld.type == "Crop") + { + std::vector inp_sc; std::vector inp_zp; + impl->getQuantizationParams(*ld.inputBlobs[0], inp_sc, inp_zp); + sc.assign(ld.outputBlobs.size(), inp_sc[0]); + zp.assign(ld.outputBlobs.size(), inp_zp[0]); + } + else + { + for (int i = 0; i < ld.outputBlobs.size(); i++) + impl->getQuantizationParams(ld.outputBlobs[i], sc, zp); + } + scales.push_back(sc); + zeropoints.push_back(zp); + } + + // For some layers, the input and output scales/zeropoints must be equal so that rescaling of inputs + // is not needed during quantized inference. We start from the last layer and modify the layer's input scales/zeropoints + // TODO : Need a different approach. Current solution fails when 2 such layers have the same input layer + for (Impl::MapIdToLayerData::reverse_iterator it = impl->layers.rbegin(); it != impl->layers.rend(); ++it) + { + LayerData& ld = it->second; + // Layers with multiple outputs. 
Number of outputs is equal to number of inputs + if (ld.type == "Blank" || ld.type == "Dropout" || ld.type == "Identity" || ld.type == "Silence" || + ld.type == "Flatten" || ld.type == "Padding" || ld.type == "Permute" || ld.type == "Reshape" || + ld.type == "ReLU6" || ld.type == "Reorg" || ld.type == "ShuffleChannel" || + (ld.type == "ReLU" && !ld.params.get("negative_slope", 0.f)) /* ReLU with negative slope 0 */) + { + for (int i = 0; i < ld.outputBlobs.size(); i++) + { + LayerPin &pin = ld.inputBlobsId[i]; + scales[pin.lid][pin.oid] = scales[ld.id][i]; + zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][i]; + } + } + // Layers with multiple inputs and single output. + else if ((ld.type == "Pooling" && toLowerCase(ld.params.get("pool", "max")) == "max") /* Max Pooling */ || + (ld.type == "Eltwise" && toLowerCase(ld.params.get("operation", "sum")) == "max") /* Elementwise max */ || + ld.type == "Concat") + { + for (int i = 0; i < ld.inputBlobsId.size(); i++) + { + LayerPin &pin = ld.inputBlobsId[i]; + scales[pin.lid][pin.oid] = scales[ld.id][0]; + zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][0]; + } + } + } + + // Create a new Net and add quantized layers to it. + Net dstNet; + dstNet.impl->netWasQuantized = true; + dstNet.setInputsNames(impl->netInputLayer->outNames); + dstNet.setPreferableBackend(prefBackend); + dstNet.setPreferableTarget(prefTarget); + dstNet.enableFusion(originalFusion); + + for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++) + { + LayerData ld = it->second; + if (ld.id == 0) + { + LayerData &quantInpLd = dstNet.impl->layers[0]; + quantInpLd.dtype = inputsDtype; + quantInpLd.params.set("scales", DictValue::arrayReal(scales[0].data(), scales[0].size())); + quantInpLd.params.set("zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size())); + continue; + } + + std::vector inpPins = ld.inputBlobsId; + // Fill input and output scales/zeropoints for the layer + std::vector > inp_out_sc(2); + std::vector > inp_out_zp(2); + for (int i = 0; i < inpPins.size(); i++) + { + LayerPin &pin = inpPins[i]; + inp_out_sc[0].push_back(scales[pin.lid][pin.oid]); + inp_out_zp[0].push_back(zeropoints[pin.lid][pin.oid]); + } + inp_out_sc[1] = scales[ld.id]; + inp_out_zp[1] = zeropoints[ld.id]; + + // Quantize layer + Ptr layer = ld.layerInstance; + if (layer->tryQuantize(inp_out_sc, inp_out_zp, ld.params)) + { + ld.type += "Int8"; + ld.dtype = CV_8S; + } + ld.params.set("scales", DictValue::arrayReal(inp_out_sc[1].data(), inp_out_sc[1].size())); + ld.params.set("zeropoints", DictValue::arrayInt(inp_out_zp[1].data(), inp_out_zp[1].size())); + + // Check and add quantize/dequantize node before layer + for (int i = 0; i < inpPins.size(); i++) + { + LayerPin &pin = inpPins[i]; + LayerData &inpLd = dstNet.impl->getLayerData(impl->getLayerName(pin.lid)); + pin.lid = inpLd.id; + if (inpLd.dtype != ld.dtype) + { + String layerName = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? cv::format("quantize/%s/%d", inpLd.name.c_str(), pin.oid) + : cv::format("dequantize/%s/%d", inpLd.name.c_str(), pin.oid); + // Check if quantize/dequantize node for the input layer already exists + if (dstNet.impl->getLayerId(layerName) >= 0) + { + pin.lid = dstNet.impl->getLayerId(layerName); + pin.oid = 0; + } + else + { + LayerParams lp; + lp.set("scales", inp_out_sc[0][i]); + lp.set("zeropoints", inp_out_zp[0][i]); + lp.name = layerName; + lp.type = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? 
"Quantize" : "Dequantize"; + int newLid = dstNet.addLayer(lp.name, lp.type, ld.dtype, lp); + dstNet.connect(pin.lid, pin.oid, newLid, 0); + pin.lid = newLid; pin.oid = 0; + } + } + } + + // Add quantized layer to Net and connect to its inputs. + int newLid = dstNet.addLayer(ld.name, ld.type, ld.dtype, ld.params); + for( int i = 0; i < inpPins.size(); i++ ) + dstNet.connect(inpPins[i].lid, inpPins[i].oid, newLid, i); + + // If the layer is a output layer, add quantize/dequantize node after it based on output's data type. + if (ld.requiredOutputs.size() == 0 && ld.dtype != outputsDtype) + { + LayerParams lp; + lp.set("scales", inp_out_sc[1][0]); + lp.set("zeropoints", inp_out_zp[1][0]); + lp.name = ((ld.dtype == CV_32F && outputsDtype == CV_8S) ? "quantize/" : "dequantize/") + ld.name; + lp.type = (ld.dtype == CV_32F && outputsDtype == CV_8S) ? "Quantize" : "Dequantize"; + dstNet.addLayerToPrev(lp.name, lp.type, outputsDtype, lp); + } + } + // Restore FP32 Net's backend, target and fusion + setPreferableBackend(prefBackend); + setPreferableTarget(prefTarget); + enableFusion(originalFusion); + return dstNet; +} + +void Net::getInputDetails(std::vector& scales, std::vector& zeropoints) const +{ + if (!impl->netWasQuantized) + CV_Error(Error::StsBadFunc, "Net isn't quantized"); + + LayerParams &lp = impl->layers[0].params; + DictValue sc = lp.get("scales"); + DictValue zp = lp.get("zeropoints"); + + for (int i = 0; i < sc.size(); i++) + { + scales.push_back(sc.get(i)); + zeropoints.push_back(zp.get(i)); + } +} + +void Net::getOutputDetails(std::vector& scales, std::vector& zeropoints) const +{ + if (!impl->netWasQuantized) + CV_Error(Error::StsBadFunc, "Net isn't quantized"); + + std::vector outLayerIds = getUnconnectedOutLayers(); + for (auto &lid : outLayerIds) + { + LayerParams &lp = impl->layers[lid].params; + DictValue sc = lp.get("scales"); + DictValue zp = lp.get("zeropoints"); + + for (int i = 0; i < sc.size(); i++) + { + scales.push_back(sc.get(i)); + zeropoints.push_back(zp.get(i)); + } + } +} + void Net::setPreferableBackend(int backendId) { CV_TRACE_FUNCTION(); CV_TRACE_ARG(backendId); + if (backendId == DNN_BACKEND_DEFAULT) + backendId = (Backend)PARAM_DNN_BACKEND_DEFAULT; + + if (impl->netWasQuantized && backendId != DNN_BACKEND_OPENCV) + { + CV_LOG_WARNING(NULL, "DNN: Only default backend supports quantized networks"); + backendId = DNN_BACKEND_OPENCV; + } + #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE) backendId = getInferenceEngineBackendTypeParam(); @@ -4277,7 +4583,6 @@ void Net::setPreferableBackend(int backendId) if( impl->preferableBackend != backendId ) { impl->preferableBackend = backendId; - impl->netWasAllocated = false; impl->clear(); } } @@ -4287,6 +4592,13 @@ void Net::setPreferableTarget(int targetId) CV_TRACE_FUNCTION(); CV_TRACE_ARG(targetId); + if (impl->netWasQuantized && targetId != DNN_TARGET_CPU && + targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16) + { + CV_LOG_WARNING(NULL, "DNN: Only CPU and OpenCL/OpenCL FP16 target is supported by quantized networks"); + targetId = DNN_TARGET_CPU; + } + if( impl->preferableTarget != targetId ) { impl->preferableTarget = targetId; @@ -4306,7 +4618,6 @@ void Net::setPreferableTarget(int targetId) impl->preferableTarget = DNN_TARGET_OPENCL; #endif } - impl->netWasAllocated = false; impl->clear(); } } @@ -4935,9 +5246,10 @@ void Net::getMemoryConsumption(const int layerId, ShapesVec inLayerShapes, outLayerShapes; getLayerShapes(netInputShapes, layerId, inLayerShapes, 
outLayerShapes); + size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float); for(int i = 0; i < outLayerShapes.size(); i++) { - blobs += total(outLayerShapes[i]) * sizeof(float); + blobs += total(outLayerShapes[i]) * elemSize; } } @@ -4986,7 +5298,7 @@ void Net::getMemoryConsumption(const std::vector& netInputShapes, std::vector > inLayerShapes, outLayerShapes; getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes); - + size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float); for(int i = 0; i < layerIds.size(); i++) { int w = 0, b = 0; @@ -5001,7 +5313,7 @@ void Net::getMemoryConsumption(const std::vector& netInputShapes, for(int j = 0; j < outLayerShapes[i].size(); j++) { - b += total(outLayerShapes[i][j]) * sizeof(float); + b += total(outLayerShapes[i][j]) * elemSize; } weights.push_back(w); @@ -5021,7 +5333,6 @@ void Net::enableFusion(bool fusion) if( impl->fusion != fusion ) { impl->fusion = fusion; - impl->netWasAllocated = false; impl->clear(); } } @@ -5195,6 +5506,12 @@ void Layer::getScaleShift(Mat& scale, Mat& shift) const shift = Mat(); } +void Layer::getScaleZeropoint(float& scale, int& zeropoint) const +{ + scale = 1.f; + zeropoint = 0; +} + void Layer::unsetAttached() { setActivation(Ptr()); @@ -5321,6 +5638,12 @@ void Layer::run(const std::vector &inputs, std::vector &outputs, std:: this->forward(inputs, outputs, internals); } +bool Layer::tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) +{ + return false; +} + Layer::~Layer() {} bool Layer::getMemoryShapes(const std::vector &inputs, diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 1916aa0ec9..9d8a3783a2 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -141,6 +141,44 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(LSTM, LSTMLayer); CV_DNN_REGISTER_LAYER_CLASS(GRU, GRULayer); CV_DNN_REGISTER_LAYER_CLASS(CumSum, CumSumLayer); + + CV_DNN_REGISTER_LAYER_CLASS(Quantize, QuantizeLayer); + CV_DNN_REGISTER_LAYER_CLASS(Dequantize, DequantizeLayer); + CV_DNN_REGISTER_LAYER_CLASS(ConvolutionInt8, ConvolutionLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(InnerProductInt8, InnerProductLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(PoolingInt8, PoolingLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(EltwiseInt8, EltwiseLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(BatchNormInt8, BatchNormLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ScaleInt8, ScaleLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ShiftInt8, ShiftLayerInt8); + + CV_DNN_REGISTER_LAYER_CLASS(ReLUInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ReLU6Int8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SigmoidInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(TanHInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SwishInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(MishInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(ELUInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(BNLLInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(AbsValInt8, ActivationLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SoftmaxInt8, SoftmaxLayerInt8); + CV_DNN_REGISTER_LAYER_CLASS(SoftMaxInt8, SoftmaxLayerInt8); + + CV_DNN_REGISTER_LAYER_CLASS(ConcatInt8, ConcatLayer); + CV_DNN_REGISTER_LAYER_CLASS(FlattenInt8, FlattenLayer); + CV_DNN_REGISTER_LAYER_CLASS(PaddingInt8, PaddingLayer); + CV_DNN_REGISTER_LAYER_CLASS(BlankInt8, BlankLayer); + CV_DNN_REGISTER_LAYER_CLASS(DropoutInt8, BlankLayer); + 
CV_DNN_REGISTER_LAYER_CLASS(IdentityInt8, BlankLayer); + CV_DNN_REGISTER_LAYER_CLASS(SilenceInt8, BlankLayer); + CV_DNN_REGISTER_LAYER_CLASS(ConstInt8, ConstLayer); + CV_DNN_REGISTER_LAYER_CLASS(ReshapeInt8, ReshapeLayer); + CV_DNN_REGISTER_LAYER_CLASS(SplitInt8, SplitLayer); + CV_DNN_REGISTER_LAYER_CLASS(SliceInt8, SliceLayer); + CV_DNN_REGISTER_LAYER_CLASS(CropInt8, CropLayer); + CV_DNN_REGISTER_LAYER_CLASS(PermuteInt8, PermuteLayer); + CV_DNN_REGISTER_LAYER_CLASS(ReorgInt8, ReorgLayer); + CV_DNN_REGISTER_LAYER_CLASS(ShuffleChannelInt8, ShuffleChannelLayer); } CV__DNN_INLINE_NS_END diff --git a/modules/dnn/src/int8layers/batch_norm_layer.cpp b/modules/dnn/src/int8layers/batch_norm_layer.cpp new file mode 100644 index 0000000000..c5b8c3d9e9 --- /dev/null +++ b/modules/dnn/src/int8layers/batch_norm_layer.cpp @@ -0,0 +1,178 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" +#include + +namespace cv +{ +namespace dnn +{ + +class BatchNormLayerInt8Impl CV_FINAL : public BatchNormLayerInt8 +{ +public: + Mat origin_weights, origin_bias; + Mat weights_, bias_; + mutable int dims; + + BatchNormLayerInt8Impl(const LayerParams& params) + : dims(-1) + { + setParamsFrom(params); + useGlobalStats = params.get("use_global_stats", true); + input_sc = params.get("input_scale"); + input_zp = params.get("input_zeropoint"); + output_sc = params.get("scales"); + output_zp = params.get("zeropoints"); + + CV_Assert(blobs.size() == 2); + size_t n = blobs[0].total(); + CV_Assert(blobs[1].total() == n && + blobs[0].isContinuous() && blobs[1].isContinuous() && + blobs[0].type() == CV_32F && blobs[1].type() == CV_32F); + + origin_weights = blobs[0]; + origin_bias = blobs[1]; + } + + virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE + { + origin_weights.convertTo(weights_, CV_32F, input_sc/output_sc); + addWeighted(origin_bias, 1.0/output_sc, weights_, -input_zp, output_zp, bias_, CV_32F); + } + + void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE + { + scale = origin_weights; + shift = origin_bias; + } + + void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE + { + scale = output_sc; + zeropoint = output_zp; + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + Mat w_, b_; + top->getScaleShift(w_, b_); + if (w_.empty() && b_.empty()) + return false; + + const int numChannels = weights_.total(); + const int numFusedWeights = w_.total(); + const int numFusedBias = b_.total(); + + if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w_.empty()) || + (numFusedBias != numChannels && numFusedBias != 1 && !b_.empty())) + return false; + + float new_sc; + int new_zp; + top->getScaleZeropoint(new_sc, new_zp); + + Mat w = numFusedWeights == 1 ? Mat(1, numChannels, CV_32F, Scalar(w_.at(0))) : + (w_.empty() ? Mat::ones(1, numChannels, CV_32F) : w_.reshape(1, 1)); + + Mat b = numFusedBias == 1 ? Mat(1, numChannels, CV_32F, Scalar(b_.at(0))) : + (b_.empty() ? 
Mat::zeros(1, numChannels, CV_32F) : b_.reshape(1, 1)); + + weights_ = Mat(); bias_ = Mat(); + multiply(origin_weights, w, weights_, input_sc/new_sc, CV_32F); + multiply(origin_bias, w, bias_); + add(bias_, b, bias_); + addWeighted(bias_, 1.0/new_sc, weights_, -input_zp, new_zp, bias_, CV_32F); + return true; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + dims = inputs[0].size(); + if (!useGlobalStats && inputs[0][0] != 1) + CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1"); + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return true; + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + return activ_int8->blobs.empty(); + } + return false; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + CV_Assert(blobs.size() == 2); + CV_Assert(inputs.size() == 1); + + Mat &inpBlob = inputs[0]; + int planeSize = 1; + for (size_t i = 2; i < inpBlob.dims; i++) { + planeSize *= inpBlob.size[i]; + } + + for (size_t ii = 0; ii < outputs.size(); ii++) + { + Mat &outBlob = outputs[ii]; + + for(int num = 0; num < outBlob.size[0]; num++) + { + for (int n = 0; n < outBlob.size[1]; n++) + { + float w = weights_.at(n); + float b = bias_.at(n); + Mat inpBlobPlane(1, planeSize, CV_8S, inpBlob.ptr(num, n)); + Mat outBlobPlane(1, planeSize, CV_8S, outBlob.ptr(num, n)); + inpBlobPlane.convertTo(outBlobPlane, CV_8S, w, b); + } + } + } + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + + int64 flops = 0; + for(int i = 0; i < inputs.size(); i++) + { + flops += 3*total(inputs[i]); + } + return flops; + } + +private: + bool useGlobalStats; +}; + +Ptr BatchNormLayerInt8::create(const LayerParams& params) +{ + return Ptr(new BatchNormLayerInt8Impl(params)); +} + +} // namespace dnn +} // namespace cv diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp new file mode 100644 index 0000000000..05749885c0 --- /dev/null +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -0,0 +1,1136 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
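// Illustrative reference only (not used by the kernels below, and assuming the
// <cmath>/<algorithm>/<cstdint> headers pulled in by the includes that follow):
// the int8 convolution accumulates int8*int8 products into an int32 sum that
// already contains the fused bias from blobs[1] (which folds in
// -input_zeropoint * sum_of_weights), then applies the per-output-channel float
// multiplier from blobs[2], shifts by the output zero-point and saturates.
static inline int8_t requantizeRef(int acc, float multiplier, int outZp)
{
    int q = outZp + (int)std::round(acc * multiplier);  // rescale and shift
    return (int8_t)std::min(std::max(q, -128), 127);    // saturate to [-128, 127]
}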
+ +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include + +#include "opencv2/core/hal/hal.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include +#include + +namespace cv +{ +namespace dnn +{ + +#if CV_SIMD +static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b, + v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3) +{ + v_int16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + + v_int32x4 t0, t1; + v_mul_expand(a0, b0, t0, t1); + out0 += t0; out1 += t1; + + v_mul_expand(a1, b1, t0, t1); + out2 += t0; out3 += t1; +} +#endif + +class BaseConvolutionLayerInt8Impl : public ConvolutionLayerInt8 +{ +public: + BaseConvolutionLayerInt8Impl(const LayerParams ¶ms) + { + setParamsFrom(params); + getConvolutionKernelParams(params, kernel_size, pads_begin, pads_end, strides, dilations, padMode, adjust_pads); + + numOutput = params.get("num_output"); + int ngroups = params.get("group", 1); + CV_Assert(numOutput % ngroups == 0); + + input_zp = params.get("input_zeropoint"); + output_zp = params.get("zeropoints"); + output_sc = params.get("scales"); + + if (kernel_size.size() == 2) { + kernel = Size(kernel_size[1], kernel_size[0]); + stride = Size(strides[1], strides[0]); + for (int i = 0; i < pads_begin.size(); i++) { + if (pads_begin[i] != pads_end[i]) + CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer"); + } + pad = Size(pads_begin[1], pads_begin[0]); + dilation = Size(dilations[1], dilations[0]); + + adjustPad.height = adjust_pads[0]; + adjustPad.width = adjust_pads[1]; + } + + for (int i = 0; i < adjust_pads.size(); i++) { + CV_Assert(adjust_pads[i] < strides[i]); + } + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + // blobs[0] - Weights (INT8) + // blobs[1] - Biases (INT32) + // blobs[2] - Multipliers for convolution output stage (FP32) + CV_Assert(!inputs.empty() && blobs.size() == 3); + MatSize weightShape = blobs[0].size; + + CV_Assert(inputs[0].dims == outputs[0].dims); + if (weightShape.dims() == 3) + { + kernel_size.assign(1, kernel_size[0]); + strides.assign(1, strides[0]); + dilations.assign(1, dilations[0]); + pads_begin.assign(1, pads_begin[0]); + pads_end.assign(1, pads_end[0]); + } + CV_Assert(weightShape.dims() == kernel_size.size() + 2); + for (int i = 0; i < kernel_size.size(); i++) { + CV_Assert(weightShape[i + 2] == kernel_size[i]); + } + + const Mat &input = inputs[0]; + CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && input.type() == CV_8S); + for (size_t i = 0; i < outputs.size(); i++) + { + CV_Assert(inputs[i].type() == input.type()); + CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]); + for (int j = 0; j < inputs[i].dims; j++) { + CV_Assert(inputs[i].size[j] == input.size[j]); + } + } + + std::vector inpShape; + std::vector outShape; + for (int i = 2; i < inputs[0].dims; i++) { + inpShape.push_back(inputs[0].size[i]); + outShape.push_back(outputs[0].size[i]); + } + getConvPoolPaddings(inpShape, kernel_size, strides, padMode, pads_begin, pads_end); + if (pads_begin.size() == 2) { + for (int i = 0; i < pads_begin.size(); i++) { + if (pads_begin[i] != pads_end[i]) + CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer"); + } + pad = 
Size(pads_begin[1], pads_begin[0]); + } + } + + virtual MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const = 0; + bool is1x1() const + { + return (kernel.height == 1 && kernel.width == 1) && + (stride.height == 1 && stride.width == 1) && + (dilation.height == 1 && dilation.width == 1); + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + Mat w, b; + top->getScaleShift(w, b); + if (w.empty() && b.empty()) + return false; + + CV_Assert((w.empty() || w.type() == CV_32F) && + (b.empty() || b.type() == CV_32F)); + + float new_sc; + int new_zp; + top->getScaleZeropoint(new_sc, new_zp); + fuseWeights(w, b, new_sc); + output_sc = new_sc; + output_zp = new_zp; + return true; + } + + virtual void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) = 0; +}; + +//TODO: simultaneously convolution and bias addition for cache optimization +class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl +{ +public: + enum { VEC_ALIGN = 32, DFT_TYPE = CV_8S }; + Mat weightsMat; + std::vector biasvec; + Mat outputMultiplier; + Mat activationLUT; + Ptr activ; + + ConvolutionLayerInt8Impl(const LayerParams ¶ms) : BaseConvolutionLayerInt8Impl(params){} + + MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE + { + CV_Assert(!blobs.empty()); + int dims = inpShape.size(); + int inpD = dims == 5 ? inpShape[2] : 1; + int inpH = inpShape[dims - 2]; + int inpW = inpShape.back(); + int inpGroupCn = blobs[0].size[1]; + int ksize = inpGroupCn * std::accumulate(kernel_size.begin(), kernel_size.end(), + 1, std::multiplies()); + return shape(inpD * inpH * inpW, ksize); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + size_t ksize = kernel_size.size(); + // Only default backend and Conv1D/Conv2D/Conv3D are supported + return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(!blobs.empty()); + const int* weightShape = blobs[0].size.p; + CV_Assert(blobs[1].total() == (size_t)weightShape[0]); + + internals.clear(); + + CV_Assert(inputs.size() != 0); + std::vector inpShape(inputs[0].begin() + 2, inputs[0].end()); + + int outCn = weightShape[0]; + std::vector outShape; + outShape.push_back(inputs[0][0]); + outShape.push_back(outCn); + + int inpCn = inputs[0][1]; + if (padMode.empty()) + { + for (int i = 0; i < inpShape.size(); i++) + outShape.push_back((inpShape[i] + pads_begin[i] + pads_end[i] - dilations[i] * (kernel_size[i] - 1) - 1) / strides[i] + 1); + } + else + { + getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape); + } + + int ngroups = inpCn / weightShape[1]; + if (ngroups == 0 || ngroups * weightShape[1] != inpCn) + CV_Error(Error::StsError, format("Number of input channels should " + "be multiple of %d but got %d", weightShape[1], inpCn)); + CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0); + + outputs.resize(1, outShape); + + return false; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + BaseConvolutionLayerInt8Impl::finalize(inputs_arr, outputs_arr); + + std::vector inputs; + inputs_arr.getMatVector(inputs); + // prepare weightsMat where each row is aligned and has enough zero padding on the right to + // use vectorized (i.e. 
with intrinsics) loops without tail processing + Mat wm = blobs[0].reshape(1, numOutput); + if( wm.step1() % VEC_ALIGN != 0 ) + { + int newcols = (int)alignSize(wm.step1(), VEC_ALIGN); + Mat wm_buffer = Mat(numOutput, newcols, wm.type()); + Mat wm_padding = wm_buffer.colRange(wm.cols, newcols); + wm_padding.setTo(Scalar::all(0)); + Mat wm_aligned = wm_buffer.colRange(0, wm.cols); + wm.copyTo(wm_aligned); + wm = wm_aligned; + } + weightsMat = wm; + + Mat biasMat = blobs[1]; + biasvec.resize(numOutput+2); + for(int i = 0; i < numOutput; i++ ) + biasvec[i] = biasMat.at(i); + + outputMultiplier = blobs[2]; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + activ = activ_int8; + if (!activ_int8->blobs.empty()) + activ_int8->blobs[0].convertTo(activationLUT, CV_32S); + return true; + } + return false; + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + return BaseConvolutionLayerInt8Impl::tryFuse(top); + } + + void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) CV_OVERRIDE + { + const int outCn = weightsMat.size[0]; + Mat w = w_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(w_.at(0))) : w_; + Mat b = b_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(b_.at(0))) : b_; + CV_Assert_N(!weightsMat.empty(), biasvec.size() == outCn + 2, + w.empty() || outCn == w.total(), b.empty() || outCn == b.total()); + + for (int i = 0; i < outCn; ++i) + { + float off = outputMultiplier.at(i) * output_sc; + if (!w.empty()) + off *= w.at(i); + + if (!b.empty()) + biasvec[i] += (int)std::round(b.at(i)/off); + + outputMultiplier.at(i) = off/new_sc; + } + biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1]; + } + + class ParallelConv : public cv::ParallelLoopBody + { + public: + enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 }; + + const Mat* input_; + const Mat* weights_; + Mat* output_; + int outShape[4]; // used only for conv2d + std::vector kernel_size, pads_begin, pads_end, strides, dilations; + int ngroups_, nstripes_; + std::vector ofstab_; + const std::vector* biasvec_; + const Mat* activLUT_; + const ActivationLayerInt8* activ_; + bool is1x1_; + bool useAVX2; + bool useAVX512; + int blk_size_cn; + int inpZp, outZp; + const float* multiplier; + + ParallelConv() + : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), + biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false) + , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0) + {} + + static void run( const Mat& input, Mat& output, const Mat& weights, const Mat& multipliers, + const std::vector& biasvec, const Mat& activLUT, + const std::vector& kernel_size, const std::vector& strides, + const std::vector& pads_begin, const std::vector& pads_end, + const std::vector& dilations, + const ActivationLayerInt8* activ, int ngroups, int nstripes, int inp_Zp, int out_Zp) + { + size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(), + 1, std::multiplies()); + bool isConv1D = input.dims == 3; + bool isConv2D = input.dims == 4; + bool isConv3D = input.dims == 5; + CV_CheckEQ(static_cast(kernel_size.size()), input.dims - 2, ""); + CV_Assert_N(input.dims == output.dims, + input.size[0] == output.size[0], + weights.rows == output.size[1], + weights.cols == (input.size[1]/ngroups)*karea, + input.type() == CV_8SC1, + output.type() == CV_32SC1, + input.type() == weights.type(), + input.isContinuous(), + output.isContinuous(), + biasvec.size() == (size_t)output.size[1]+2); + CV_Check(weights.step1(), weights.step1() % VEC_ALIGN == 0, 
""); + ParallelConv p; + + p.input_ = &input; + p.weights_ = &weights; + p.output_ = &output; + int max_ind = isConv1D? 3: 4; + for( int i = 0; i < max_ind; i++ ) p.outShape[i] = output.size[i]; + p.outShape[1] /= ngroups; + + p.kernel_size = kernel_size; p.strides = strides; p.dilations = dilations; + p.pads_begin = pads_begin; p.pads_end = pads_end; + + p.ngroups_ = ngroups; + p.nstripes_ = nstripes; + + int inpCnAll = input.size[1]; + int depth = (input.dims == 5) ? input.size[2] : 1; + int width = input.size[input.dims - 1]; + int height = isConv1D? 1 : input.size[input.dims - 2]; + int inpCn = inpCnAll / ngroups; + + p.is1x1_ = (isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 && + pads_begin[0] == 0 && pads_begin[1] == 0) || + (isConv1D && pads_begin[0] == 0 && kernel_size[0] == 1); + + p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; + + int kernel_d = isConv3D? kernel_size[0] : 1; + int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + + int blk_size_cn0 = cvCeil(1600./(kernel_w*kernel_h)); + int ncn = 32; + while (ncn*2 < blk_size_cn0 && ncn < inpCn) + ncn *= 2; + ncn = std::min(ncn, inpCn); + p.blk_size_cn = ncn; + + int dil_d = isConv3D? dilations[0] : 1; + int dil_h = isConv1D? 1 : dilations[dilations.size() - 2]; + int dil_w = dilations.back(); + + p.inpZp = inp_Zp; + p.outZp = out_Zp; + p.multiplier = multipliers.ptr(0); + + p.ofstab_.resize(karea * ncn); + int* ofstab = &p.ofstab_[0]; + + if (isConv1D) + { + for( int k = 0; k < ncn; k++ ) + for( int k_c = 0; k_c < kernel_w; k_c++ ) + ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w; + } + else if (isConv2D) + { + for( int k = 0; k < ncn; k++ ) + for( int k_r = 0; k_r < kernel_h; k_r++ ) + for( int k_c = 0; k_c < kernel_w; k_c++ ) + ofstab[(k*kernel_h + k_r)*kernel_w + k_c] = + (k*height + k_r*dil_h)*width + k_c*dil_w; + } + else + { + for( int k = 0; k < ncn; k++ ) + for (int k_d = 0; k_d < kernel_d; k_d++) + for( int k_r = 0; k_r < kernel_h; k_r++ ) + for( int k_c = 0; k_c < kernel_w; k_c++ ) + ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] = + (k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w; + } + + p.biasvec_ = &biasvec; + p.activLUT_ = &activLUT; + p.activ_ = !activLUT.empty() ? activ : 0; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + virtual void operator ()(const Range &r0) const CV_OVERRIDE + { + const int valign = ConvolutionLayerInt8Impl::VEC_ALIGN; + int ngroups = ngroups_, batchSize = input_->size[0]*ngroups; + bool isConv1D = input_->dims == 3; + bool isConv2D = input_->dims == 4; + bool isConv3D = input_->dims == 5; + + int outW = output_->size[output_->dims - 1]; + int outH = isConv1D? 1 : output_->size[output_->dims - 2]; + int outCn = output_->size[1]/ngroups; + + int depth = isConv3D? input_->size[2] : 1; + int height = isConv1D? 1 : input_->size[input_->dims - 2]; + int width = input_->size[input_->dims - 1]; + int inpCn = input_->size[1]/ngroups; + + const int nstripes = nstripes_; + + int kernel_d = isConv3D? kernel_size[0] : 1; + int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + int karea = kernel_w*kernel_h*kernel_d; + + int pad_d = isConv3D? pads_begin[0] : 0; + int pad_t = isConv1D? 0 : pads_begin[pads_begin.size() - 2]; + int pad_l = pads_begin.back(); + + int stride_d = isConv3D? strides[0] : 0; + int stride_h = isConv1D? 
0 : strides[strides.size() - 2]; + int stride_w = strides.back(); + + int dilation_d = isConv3D? dilations[0] : 1; + int dilation_h = isConv1D? 1 : dilations[dilations.size() - 2]; + int dilation_w = dilations.back(); + + int i, j, k, d; + int inpPlaneSize = (int)input_->total(2); + int outPlaneSize = (int)output_->total(2); + bool is1x1 = is1x1_; + + int stripesPerSample; + int stripeSize; + Range r = r0; + bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 && + outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 && + width >= 16 + dilation_w*(kernel_w - 1); + // for now only 3x3 depth-wise convolutions are supported + depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 && + // computing at most 1 pixel from each side can involve padding + max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t && + pad_l <= 1 && pad_t <= 1; + + if( !depthWiseConvolution && nstripes >= batchSize*2 ) + { + stripesPerSample = nstripes/batchSize; + stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, 8); + stripeSize = std::min(stripeSize, outPlaneSize); + } + else + { + stripesPerSample = 1; + int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1); + r.start *= samplesPerStripe; + r.end *= samplesPerStripe; + stripeSize = outPlaneSize; + } + + const int8_t* data_inp0_ = input_->ptr(); + const int* ofstab = &ofstab_[0]; + const int8_t* wptr_orig_ = weights_->ptr(); + size_t wstep = weights_->step1(); + const int* biasptr_ = &biasvec_->at(0); + const int* lutptr_ = !activLUT_->empty() ? activLUT_->ptr() : 0; + int* data_out0_ = output_->ptr(); + AutoBuffer rowbuf0_; + int8_t* rowbuf0 = 0; + bool use_rowbuf = !depthWiseConvolution; + int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize); + + // im2row buffer is not used for depth-wise convolution + if(use_rowbuf) + { + size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size); + //printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize); + rowbuf0_.allocate(rowbufsz + valign); + rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(int8_t))); + // we clear the buffer once; ultimately, it lets us to avoid + // tail processing after running the unrolled/vectorized loop. + // the main idea is to make sure that the tail (a.k.a. padding) of each row + // (i.e. the elements with indices between vsz=karea*ncn and vsz_a) + // does not contain NaNs or Infs. Because the padding in the weights + // matrix is explicitly initialized with 0's, we handle all other + // cases nicely, i.e. we can skip expliciting re-initialization + // of the padding - we just retain elements from the previous iteration + // of the loop over channels (cn0). 
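            // Int8-specific note: the clear value is the *input zero-point*, not a
            // literal 0. A tap that reads inpZp is the quantized encoding of real 0.0,
            // and its w*inpZp product is cancelled by the -inpZp*sum_of_weights offset
            // fused into the int32 bias (see the depth-wise branch below), so padded
            // positions contribute nothing to the accumulator. The aligned tail
            // (indices vsz..vsz_a) is harmless in any case because the matching
            // columns of weightsMat were explicitly zero-padded in finalize().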
+ memset(rowbuf0, (int8_t)inpZp, rowbufsz*sizeof(rowbuf0[0]) ); + } + + for( int stripe = r.start; stripe < r.end; stripe++ ) + { + int subsampleIdx = stripe/stripesPerSample; + if( subsampleIdx >= batchSize ) + break; + int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize); + int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize); + const int8_t* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn; + int* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn; + int startOutCn = (subsampleIdx % ngroups)*outCn; + const int8_t* wptr_orig = wptr_orig_ + wstep*startOutCn; + const int* biasptr = biasptr_ + startOutCn; + const float* multptr = multiplier + startOutCn; + + for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn ) + { + int cn1 = std::min(cn0 + blk_size_cn, inpCn); + int ncn = cn1 - cn0, vsz = karea*ncn; + int vsz_a = (int)alignSize(vsz, valign); + const int8_t* wptr = wptr_orig + cn0*karea; + + for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size ) + { + int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd); + int bsz = ofs1 - ofs0; + + int out_d = ofs0 / (outH * outW); + int out_i = (ofs0 - out_d * outH * outW) / outW; + int out_j = ofs0 % outW; + + if (depthWiseConvolution) + { + CV_Assert(out_i == 0 && out_j == 0); + int in_d = out_d * stride_d - pad_d; + const int8_t* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width; + int* outptr_ = data_out0 + ofs0; + + #if CV_TRY_AVX2 + if(useAVX2) + opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); + else + #endif + { + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + int bias = biasptr[out_d], biasCopy; + float mult = multptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t* imgptr0 = inptr_ + in_i*width; + const int8_t* imgptr1 = imgptr0 + dilation_h*width; + const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out, out1; + // Bias has a fused offset component. bias = bias_quantized - input_zeropoint*sum_of_weights. + // In some cases below, certain weights are not used for convolution or set to zero. + // So we create a copy of bias at the start and remove the weight's components as necessary. 
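                            // Worked example of the adjustment below: when the top kernel row
                            // falls outside the image (in_i < 0), those taps must behave as real
                            // zeros. Zeroing w00..w02 drops their products, and adding
                            // inpZp*(w00 + w01 + w02) to biasCopy restores the part of the fused
                            // -inpZp*sum_of_weights offset that those taps would otherwise have
                            // cancelled, so out = clamp(outZp + round(acc*mult), -128, 127)
                            // matches a zero-padded convolution.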
+ biasCopy = bias; + + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + + (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + + (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + + biasCopy + inpZp*(w00 + w10 + w20); + out1 = outZp + (int)std::round(out*mult); + outptr[0] = std::min(std::max(out1, -128), 127); + out_j = 1; + } + #if CV_SIMD + if( stride_w == 1 ) + { + const int out_delta = 16; + v_int8x16 vw00 = v_setall_s8(w00), vw01 = v_setall_s8(w01), vw02 = v_setall_s8(w02), + vw10 = v_setall_s8(w10), vw11 = v_setall_s8(w11), vw12 = v_setall_s8(w12), + vw20 = v_setall_s8(w20), vw21 = v_setall_s8(w21), vw22 = v_setall_s8(w22); + v_int32x4 vout0, vout1, vout2, vout3, vbias = v_setall_s32(biasCopy), voutzp = v_setall_s32(outZp), + outmin = v_setall_s32(-128), outmax = v_setall_s32(127); + v_float32x4 vmult = v_setall_f32(mult); + for( ; out_j < outW1; out_j += out_delta ) + { + if (out_j + out_delta > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - out_delta; + } + int in_j = out_j * stride_w - pad_l; + v_int8x16 v00 = v_load(imgptr0 + in_j), + v01 = v_load(imgptr0 + in_j + dilation_w), + v02 = v_load(imgptr0 + in_j + dilation_w*2), + v10 = v_load(imgptr1 + in_j), + v11 = v_load(imgptr1 + in_j + dilation_w), + v12 = v_load(imgptr1 + in_j + dilation_w*2), + v20 = v_load(imgptr2 + in_j), + v21 = v_load(imgptr2 + in_j + dilation_w), + v22 = v_load(imgptr2 + in_j + dilation_w*2); + + vout0 = vout1 = vout2 = vout3 = vbias; + v_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + v_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + v_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + v_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + v_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + v_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + v_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult); + vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult); + vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult); + vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult); + + vout0 = v_min(v_max(vout0, outmin), outmax); + vout1 = v_min(v_max(vout1, outmin), outmax); + vout2 = v_min(v_max(vout2, outmin), outmax); + vout3 = v_min(v_max(vout3, outmin), outmax); + + v_store(outptr + out_j, vout0); + v_store(outptr + out_j + 4, vout1); + v_store(outptr + out_j + 8, vout2); + v_store(outptr + out_j + 12, vout3); + } + } + #endif + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + + (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + + (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; + out1 = outZp + (int)std::round(out*mult); + outptr[out_j] = std::min(std::max(out1, -128), 127); + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + int s0 = 1, s1 = 1, s2 = 
1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp*(w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp*(w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp*(w02 + w12 + w22); + } + out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + + (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + + (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; + out1 = outZp + (int)std::round(out*mult); + outptr[out_j] = std::min(std::max(out1, -128), 127); + } + } + } + continue; + } + // do im2row for a part of input tensor + int8_t* rowbuf = rowbuf0; + + if (isConv1D) + { + for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i ) + { + int delta = std::min(ofs1 - ofs, outW - out_j); + int out_j1 = out_j + delta; + + int in_j = out_j * stride_w - pad_l; + const int8_t* imgptr = data_inp0 + cn0*width + in_j; + ofs += delta; + + // do im2row for a part of input tensor + if( is1x1 ) + { + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w ) + { + for( k = 0; k < vsz; k++ ) + rowbuf[k] = imgptr[k*inpPlaneSize]; + } + } + else + { + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) + { + // this condition should be true for most of the tensor elements, i.e. + // most of the time the kernel aperture is inside the tensor X-Y plane. + if( out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w ) + { + for( k = 0; k < vsz; k++ ) + { + int k1 = ofstab[k]; + int8_t v0 = imgptr[k1]; + int8_t v1 = imgptr[k1 + stride_w]; + rowbuf[k] = v0; + rowbuf[k+vsz_a] = v1; + } + out_j++; + rowbuf += vsz_a; + imgptr += stride_w; + in_j += stride_w; + } + else + { + int i0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); + int i1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); + + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. 
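+                        // (note) rowbuf is pre-filled with the input zero-point rather than a literal 0:
+                        // taps that fall outside the image must contribute inpZp, i.e. the quantized
+                        // encoding of real-valued zero, so the dot product stays consistent.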
+ memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0])); + for( k = 0; k < ncn; k++ ) + { + for( i = i0; i < i1; i++ ) + { + int imgofs = k*width + i*dilation_w; + rowbuf[k*kernel_w + i] = imgptr[imgofs]; + } + } + } + } + } + } + } + else if (isConv2D) + { + if( is1x1 && stride_w == 1 && stride_h == 1 ) + { + const int8_t* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j; + for( int j = 0; j < bsz; j++, rowbuf += vsz_a ) + { + if( j + 4 <= bsz ) + { + k = 0; + for( ; k < vsz; k++ ) + { + const int8_t* inp = imgptr + j + k*inpPlaneSize; + int8_t v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3]; + rowbuf[k] = v0; + rowbuf[k + vsz_a] = v1; + rowbuf[k + vsz_a*2] = v2; + rowbuf[k + vsz_a*3] = v3; + } + j += 3; + rowbuf += vsz_a*3; + } + else + { + for( k = 0; k < vsz; k++ ) + { + rowbuf[k] = imgptr[j + k*inpPlaneSize]; + } + } + } + } + else + for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i ) + { + int delta = std::min(ofs1 - ofs, outW - out_j); + int out_j1 = out_j + delta; + + int in_i = out_i * stride_h - pad_t; + int in_j = out_j * stride_w - pad_l; + const int8_t* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j; + ofs += delta; + + // do im2row for a part of input tensor + if( is1x1 ) + { + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w ) + { + for( k = 0; k < vsz; k++ ) + rowbuf[k] = imgptr[k*inpPlaneSize]; + } + } + else + { + bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h; + int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h); + int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h); + + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) + { + // this condition should be true for most of the tensor elements, i.e. + // most of the time the kernel aperture is inside the tensor X-Y plane. + if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w ) + { + for( k = 0; k < vsz; k++ ) + { + int k1 = ofstab[k]; + int8_t v0 = imgptr[k1]; + int8_t v1 = imgptr[k1 + stride_w]; + rowbuf[k] = v0; + rowbuf[k+vsz_a] = v1; + } + out_j++; + rowbuf += vsz_a; + imgptr += stride_w; + in_j += stride_w; + } + else + { + int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); + int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); + + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. 
+ memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0])); + for( k = 0; k < ncn; k++ ) + { + for( i = i0; i < i1; i++ ) + { + for( j = j0; j < j1; j++ ) + { + int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w; + rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs]; + } + } + } + } + } + } + } + } + else + { + for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 ) + { + int delta = std::min(ofs1 - ofs, outW - out_j); + int out_j1 = out_j + delta; + + int in_d = out_d * stride_d - pad_d; + int in_i = out_i * stride_h - pad_t; + int in_j = out_j * stride_w - pad_l; + const int8_t* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j; + ofs += delta; + + int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); + int d1 = std::min(kernel_d, (depth - in_d + dilation_d - 1) / dilation_d); + + int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h); + int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h); + + for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) + { + int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); + int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); + + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. + memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0])); + for( k = 0; k < ncn; k++ ) + { + for ( d = d0; d < d1; d++) + { + for( i = i0; i < i1; i++ ) + { + for( j = j0; j < j1; j++ ) + { + int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w; + rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs]; + } + } + } + } + } + } + } + // now compute dot product of the weights + // and im2row-transformed part of the tensor + #if CV_TRY_AVX512_SKX + if(useAVX512) + opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else + #endif + #if CV_TRY_AVX2 + if(useAVX2) + opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else + #endif + for( int i = 0; i < outCn; i += 2 ) + { + const int8_t* wptr0 = wptr + i*wstep; + const int8_t* wptr1 = wptr0 + wstep; + int* outptr0 = data_out0 + ofs0 + i*outPlaneSize; + int* outptr1 = outptr0 + outPlaneSize; + int bias0 = biasptr[i], bias1 = biasptr[i+1]; + float mult0 = multptr[i], mult1 = multptr[i+1]; + + if( i+1 >= outCn ) + { + wptr1 = wptr0; + outptr1 = outptr0; + bias1 = bias0; + mult1 = mult0; + } + int j = 0; + #if CV_SIMD128 + v_int32x4 voutzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127); + v_float32x4 vmult0 = v_setall_f32(mult0), vmult1 = v_setall_f32(mult1); + for( ; j <= bsz - 4; j += 4 ) + { + const int8_t* rptr = rowbuf0 + j*vsz_a; + v_int32x4 s0, s1; + + if( cn0 == 0 ) + { + s0 = v_setall_s32(bias0); + s1 = v_setall_s32(bias1); + } + else + { + s0 = v_load(outptr0 + j); + s1 = v_load(outptr1 + j); + } + + v_int32x4 vs00 = v_setzero_s32(), vs01 = v_setzero_s32(), + vs02 = v_setzero_s32(), vs03 = v_setzero_s32(), + vs10 = v_setzero_s32(), vs11 = v_setzero_s32(), + vs12 = v_setzero_s32(), vs13 = v_setzero_s32(); + for( k = 0; k < vsz; k += 16, rptr += 16 ) + { + v_int8x16 w0 = v_load_aligned(wptr0 + k); + v_int8x16 w1 = 
v_load_aligned(wptr1 + k); + v_int8x16 r0 = v_load_aligned(rptr); + v_int8x16 r1 = v_load_aligned(rptr + vsz_a); + v_int8x16 r2 = v_load_aligned(rptr + vsz_a*2); + v_int8x16 r3 = v_load_aligned(rptr + vsz_a*3); + + vs00 = v_dotprod_expand_fast(w0, r0, vs00); + vs01 = v_dotprod_expand_fast(w0, r1, vs01); + vs02 = v_dotprod_expand_fast(w0, r2, vs02); + vs03 = v_dotprod_expand_fast(w0, r3, vs03); + + vs10 = v_dotprod_expand_fast(w1, r0, vs10); + vs11 = v_dotprod_expand_fast(w1, r1, vs11); + vs12 = v_dotprod_expand_fast(w1, r2, vs12); + vs13 = v_dotprod_expand_fast(w1, r3, vs13); + } + s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03)); + s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13)); + if( cn1 == inpCn ) + { + s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0); + s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1); + + s0 = v_min(v_max(s0, outmin), outmax); + s1 = v_min(v_max(s1, outmin), outmax); + } + v_store(outptr0 + j, s0); + v_store(outptr1 + j, s1); + } + #endif + for( ; j < bsz; j++ ) + { + const int8_t* rptr = rowbuf0 + j*vsz_a; + int s00, s10; + + if( cn0 == 0 ) + { + s00 = bias0; + s10 = bias1; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + } + + for( k = 0; k < vsz; k++ ) + { + int8_t r0 = rptr[k]; + s00 += (int)wptr0[k] * r0; + s10 += (int)wptr1[k] * r0; + } + if( cn1 == inpCn ) + { + int out0 = outZp + (int)std::round(s00*mult0); + int out1 = outZp + (int)std::round(s10*mult1); + + s00 = std::min(std::max(out0, -128), 127); + s10 = std::min(std::max(out1, -128), 127); + } + + outptr0[j] = s00; + outptr1[j] = s10; + } + } + } + } + if( activ_ ) + activ_->forwardSlice(data_out0 + stripeStart, lutptr_, + data_out0 + stripeStart, (int)(stripeEnd - stripeStart), + outPlaneSize, startOutCn, startOutCn + outCn); + } + } + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + +#if CV_SSE3 + uint32_t ftzMode = _MM_GET_FLUSH_ZERO_MODE(); + uint32_t dazMode = _MM_GET_DENORMALS_ZERO_MODE(); + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + /*if (inputs[0].dims > 3) { + printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n", + name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3], + kernel.width, kernel.height, pad.width, pad.height, + stride.width, stride.height, dilation.width, dilation.height); + } + else { + printf("conv %s: input (%d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n", + name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], + kernel.width, kernel.height, pad.width, pad.height, + stride.width, stride.height, dilation.width, dilation.height); + }*/ + + int inpGroupCn = blobs[0].size[1]; + CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % inpGroupCn == 0, + outputs.size() == 1, inputs[0].data != outputs[0].data); + + int ngroups = inputs[0].size[1] / inpGroupCn; + CV_Assert(outputs[0].size[1] % ngroups == 0); + + int nstripes = std::max(getNumThreads(), 1); + Mat outputInt32 = Mat(shape(outputs[0]), CV_32S); + + ParallelConv::run(inputs[0], outputInt32, weightsMat, outputMultiplier, biasvec, activationLUT, 
kernel_size, strides, + pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes, input_zp, output_zp); + + outputInt32.convertTo(outputs[0], CV_8S); + +#if CV_SSE3 + _MM_SET_FLUSH_ZERO_MODE(ftzMode); + _MM_SET_DENORMALS_ZERO_MODE(dazMode); +#endif + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_Assert(inputs.size() == outputs.size()); + + int64 flops = 0; + int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies()); + for (int i = 0; i < outputs.size(); i++) + { + flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1); + } + return flops; + } +}; + +Ptr ConvolutionLayerInt8::create(const LayerParams ¶ms) +{ + return Ptr(new ConvolutionLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/elementwise_layers.cpp b/modules/dnn/src/int8layers/elementwise_layers.cpp new file mode 100644 index 0000000000..75118b6bc1 --- /dev/null +++ b/modules/dnn/src/int8layers/elementwise_layers.cpp @@ -0,0 +1,190 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include +#include + +namespace cv +{ +namespace dnn +{ + +class ActivationLayerInt8Impl CV_FINAL : public ActivationLayerInt8 +{ +public: + ActivationLayerInt8Impl(const LayerParams ¶ms) + { + setParamsFrom(params); + activationLUT = !blobs.empty() ? blobs[0] : Mat(); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return true; + } + + class Activation : public cv::ParallelLoopBody + { + public: + const Mat* src; + const Mat* lut; + Mat* dst; + int nstripes; + + Activation() : src(0), lut(0), dst(0), nstripes(0){} + + static void run(const Mat& src, const Mat& lut, Mat& dst, int nstripes) + { + Activation p; + + p.src = &src; + p.lut = &lut; + p.dst = &dst; + p.nstripes = nstripes; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range &r) const CV_OVERRIDE + { + const int8_t* table = lut->ptr(); + int nsamples = 1, outCn = 1; + size_t planeSize = 1; + + if (src->dims > 1) + { + nsamples = src->size[0]; + outCn = src->size[1]; + } + else + outCn = src->size[0]; + + for (int i = 2; i < src->dims; ++i) + planeSize *= src->size[i]; + + size_t stripeSize = (planeSize + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = std::min(r.end*stripeSize, planeSize); + int len = (int)(stripeEnd - stripeStart); + + for( int i = 0; i < nsamples; i++ ) + { + const int8_t* srcptr = src->ptr(i) + stripeStart; + int8_t* dstptr = dst->ptr(i) + stripeStart; + for( int cn = 0; cn < outCn; cn++, srcptr += planeSize, dstptr += planeSize ) + { + int i = 0; +#if CV_SIMD128 + for( ; i <= len - 16; i += 16 ) + { + v_int8x16 out(table[srcptr[i] + 128], table[srcptr[i+1] + 128], table[srcptr[i+2] + 128], table[srcptr[i+3] + 128], + table[srcptr[i+4] + 128], table[srcptr[i+5] + 128], table[srcptr[i+6] + 128], table[srcptr[i+7] + 128], + table[srcptr[i+8] + 128], table[srcptr[i+9] + 128], table[srcptr[i+10] + 128], table[srcptr[i+11] + 128], + table[srcptr[i+12] + 128], 
table[srcptr[i+13] + 128], table[srcptr[i+14] + 128], table[srcptr[i+15] + 128]); + v_store(dstptr + i, out); + } +#endif + for( ; i < len; i++ ) + { + dstptr[i] = table[srcptr[i] + 128]; + } + } + } + } + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + for (size_t i = 0; i < inputs.size(); i++) + { + const Mat &src = inputs[i]; + if (!activationLUT.empty()) + { + const int nstripes = getNumThreads(); + Mat &dst = outputs[i]; + CV_Assert(src.size == dst.size && src.type() == dst.type() && + src.isContinuous() && dst.isContinuous() && src.type() == CV_8S); + + Activation::run(src, activationLUT, dst, nstripes); + } + else + { + src.copyTo(outputs[i]); + } + } + } + + void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE + { + for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize ) + { + int i = 0; +#if CV_SIMD128 + for( ; i <= len - 16; i += 16 ) + { + v_int8x16 out(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128], + lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128], + lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128], + lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]); + v_store(dst + i, out); + } +#endif + for( ; i < len; i++ ) + dst[i] = lut[src[i] + 128]; + } + } + + void forwardSlice(const int* src, const int* lut, int* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE + { + for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize ) + { + int i = 0; +#if CV_SIMD128 + for( ; i <= len - 16; i += 16 ) + { + v_int32x4 out0(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128]); + v_int32x4 out1(lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128]); + v_int32x4 out2(lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128]); + v_int32x4 out3(lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]); + + v_store(dst + i, out0); + v_store(dst + i + 4, out1); + v_store(dst + i + 8, out2); + v_store(dst + i + 12, out3); + } +#endif + for( ; i < len; i++ ) + dst[i] = lut[src[i] + 128]; + } + + } + + Mat activationLUT; +}; + +Ptr ActivationLayerInt8::create(const LayerParams& params) +{ + return Ptr(new ActivationLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/eltwise_layer.cpp b/modules/dnn/src/int8layers/eltwise_layer.cpp new file mode 100644 index 0000000000..be7a32b1ef --- /dev/null +++ b/modules/dnn/src/int8layers/eltwise_layer.cpp @@ -0,0 +1,577 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
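The activation layer above reduces every int8 activation function to a 256-entry lookup table stored in blobs[0]; the parallel Activation body and both forwardSlice overloads are vectorized forms of the same indexing. A minimal scalar sketch of that lookup (the helper name applyInt8LUT is illustrative and not part of the patch):

    #include <cstdint>

    // Illustrative sketch only: int8 values in [-128, 127] are shifted to [0, 255]
    // and used to index a 256-entry table, exactly as in forwardSlice above.
    static inline void applyInt8LUT(const int8_t* lut, const int8_t* src, int8_t* dst, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = lut[src[i] + 128];
    }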
+ +#include "../precomp.hpp" +#include "layers_common.hpp" +#include + +namespace cv +{ +namespace dnn +{ + +class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8 +{ +public: + enum EltwiseOp + { + PROD = 0, + SUM = 1, + MAX = 2 + } op; + std::vector coeffs; + std::vector zeropoints; + + enum OutputChannelsMode + { + ELTWISE_CHANNNELS_SAME = 0, //!< number of channels from inputs must be the same and equal to output's number of channels + ELTWISE_CHANNNELS_INPUT_0, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to number of channels of first input + //!< number of channels of other inputs should not be greater than number of channels of first input + ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to number of channels of first input + //!< there is restriction on number of channels of other inputs + //!< extra channels of other inputs is ignored + ELTWISE_CHANNNELS_USE_MAX, //!< number of channels from inputs may be different, + //!< output's number of channels is equal to maximal number of input channels + //!< @note supported operation: `SUM` + } channelsModeInput; + + + mutable OutputChannelsMode channelsMode; //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal) + mutable /*size_t*/int outputChannels; + + EltwiseLayerInt8Impl(const LayerParams& params) + : outputChannels(0) + { + setParamsFrom(params); + offset = params.get("offset", 0.f); + hasVecInput = false; + op = SUM; + if (params.has("operation")) + { + String operation = toLowerCase(params.get("operation")); + if (operation == "prod") + op = PROD; + else if (operation == "sum") + op = SUM; + else if (operation == "max") + op = MAX; + else + CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\""); + } + + if (params.has("coeff")) + { + DictValue paramCoeff = params.get("coeff"); + int i, n = paramCoeff.size(); + coeffs.resize(n); + for (i = 0; i < n; i++) + { + coeffs[i] = paramCoeff.get(i); + } + } + + if (params.has("input_zeropoints")) + { + DictValue zp = params.get("input_zeropoints"); + int i, n = zp.size(); + zeropoints.resize(n); + for (i = 0; i < n; i++) + { + zeropoints[i] = zp.get(i); + } + } + + channelsModeInput = ELTWISE_CHANNNELS_SAME; + if (params.has("output_channels_mode")) + { + String v = toLowerCase(params.get("output_channels_mode")); + if (v == "same") + { + channelsModeInput = ELTWISE_CHANNNELS_SAME; + } + else if (v == "input_0") + { + channelsModeInput = ELTWISE_CHANNNELS_INPUT_0; + } + else if (v == "input_0_truncate") + { + channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE; + } + else if (v == "max_input_channels") + { + channelsModeInput = ELTWISE_CHANNNELS_USE_MAX; + if (op != SUM) + CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only"); + } + else + CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\""); + } + channelsMode = channelsModeInput; + + // TODO Must have checks for other unknown options + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() >= 2); + CV_Assert(inputs[0].size() >= 2); + CV_Assert(coeffs.size() == 0 || 
coeffs.size() == inputs.size()); + CV_Assert(op == SUM || op == PROD || coeffs.size() == 0); + + int dims = inputs[0].size(); + // Number of channels in output shape is determined by the first input tensor. + bool variableChannels = false; + int numChannels = inputs[0][1]; + for (size_t i = 1; i < inputs.size(); i++) + { + CV_Assert(inputs[0][0] == inputs[i][0]); // batch sizes are equal + + int input_channels = inputs[i][1]; + if (numChannels != input_channels) + variableChannels = true; + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME) + { + CV_Assert(numChannels == input_channels); + } + else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0) + { + CV_Assert(numChannels >= input_channels); + } + else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE) + { + // nothing to check + } + else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX) + { + numChannels = std::max(numChannels, input_channels); + } + else + { + CV_Assert(0 && "Internal error"); + } + } + + channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME; + outputChannels = numChannels; + + outputs.assign(1, inputs[0]); + outputs[0][1] = numChannels; + + if (dims > 2) + { + size_t vecIdx = 0; + bool isVecFound = false; + for (size_t i = 0; i < inputs.size(); i++) + { + bool allOnes = isAllOnes(inputs[i], 2, dims); + if (!allOnes && !isVecFound) + { + vecIdx = i; + isVecFound = true; + } + + if (!allOnes && i != vecIdx) + { + for (size_t j = 2; j < dims; j++) + { + CV_Assert(inputs[vecIdx][j] == inputs[i][j]); + } + } + } + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound) + { + for (size_t j = 2; j < dims; j++) + { + outputs[0][j] = inputs[vecIdx][j]; + } + } + } + + return false; + } + + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE + { + std::vector inputs; + inputs_arr.getMatVector(inputs); + + for (size_t i = 0; i < inputs.size(); i++) + { + MatShape inpShape = shape(inputs[i].size); + if (isAllOnes(inpShape, 2, inputs[i].dims)) + { + hasVecInput = true; + return; + } + } + } + + class EltwiseInvoker : public ParallelLoopBody + { + EltwiseLayerInt8Impl& self; + std::vector srcs; + std::vector srcNumChannels; + int nsrcs; + Mat* dst; + Mat* buf; + std::vector coeffs; + std::vector zeropoints; + int nstripes; + const Mat* activLUT; + const ActivationLayerInt8* activ; + int channels; + size_t planeSize; + float offset; + + EltwiseInvoker(EltwiseLayerInt8Impl& self_) + : self(self_) + , nsrcs(0), dst(0), buf(0), nstripes(0), activ(0), channels(0) + , planeSize(0), offset(0) + {} + + public: + static void run(EltwiseLayerInt8Impl& self, + const Mat* srcs, int nsrcs, Mat& buf, Mat& dst, + int nstripes, float offset) + { + const EltwiseOp op = self.op; + CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_8SC1, ""); CV_Assert(dst.isContinuous()); + CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs); + CV_CheckGE(nsrcs, 2, ""); + + CV_Assert(self.outputChannels == dst.size[1]); + + EltwiseInvoker p(self); + p.srcs.resize(nsrcs); + p.srcNumChannels.resize(nsrcs); + p.coeffs = self.coeffs; // can be sorted + p.zeropoints = self.zeropoints; + + bool sortInputs = false; + for( int i = 0; i < nsrcs; i++ ) + { + p.srcs[i] = &srcs[i]; + CV_CheckEQ(srcs[i].dims, dst.dims, ""); + CV_Assert(srcs[i].isContinuous()); + CV_Assert(srcs[i].type() == dst.type()); + p.srcNumChannels[i] = (srcs[i].dims >= 4) ? 
srcs[i].size[1] : 1; + + if (self.channelsMode == ELTWISE_CHANNNELS_SAME) + { + CV_Assert(srcs[i].size == dst.size); + } + else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0) + { + if (i == 0) + CV_Assert(srcs[0].size == dst.size); + CV_Assert(self.outputChannels >= p.srcNumChannels[i]); + sortInputs = true; + } + else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE) + { + if (i == 0) + CV_Assert(srcs[0].size == dst.size); + sortInputs = true; + } + else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX) + { + CV_Assert(op == SUM); + CV_Assert(self.outputChannels >= p.srcNumChannels[i]); + sortInputs = true; + } + else + { + CV_Assert(0 && "Internal error"); + } + + if (sortInputs) + { + // Sort srcs and coefficients in the desc order by number of channels + for (int j = i; j >= 1; j--) + { + if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1])) + { + std::swap(p.srcs[j - 1], p.srcs[j]); + std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]); + if (!p.coeffs.empty()) + std::swap(p.coeffs[j - 1], p.coeffs[j]); + if (!p.zeropoints.empty()) + std::swap(p.zeropoints[j - 1], p.zeropoints[j]); + } + else + break; + } + } + } + + p.nsrcs = nsrcs; + p.dst = &dst; + p.buf = &buf; + p.nstripes = nstripes; + p.offset = offset; + p.channels = (dst.dims >= 4 ? dst.size[1] : 1); + + p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1); + CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, ""); + p.activLUT = &self.activationLUT; + p.activ = !self.activationLUT.empty() ? self.activ.get() : 0; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range& r) const CV_OVERRIDE + { + const EltwiseOp op = self.op; + size_t total = dst->size[0]*planeSize; + size_t stripeSize = (total + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = std::min(r.end*stripeSize, total); + const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0; + const int* zeropointsptr = !zeropoints.empty() ? &zeropoints[0] : 0; + const int8_t* lutptr = !activLUT->empty() ? activLUT->ptr() : 0; + int8_t* dstptr0 = dst->ptr(); + float* bufptr0 = buf->ptr(); + int blockSize0 = 1 << 12; + + for (size_t ofs = stripeStart; ofs < stripeEnd; ) + { + int sampleIdx = (int)(ofs / planeSize); + int delta = (int)ofs - sampleIdx * planeSize; + int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta)); + if( blockSize <= 0 ) + break; + ofs += blockSize; + + for (int c = 0; c < channels; c++) + { + size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize; + int8_t* dstptr = dstptr0 + dstIdx; + float* bufptr = bufptr0 + dstIdx; + + // process first two inputs + { + const int8_t* srcptr0 = srcs[0]->ptr() + dstIdx; + + const int inputIdx = 1; + int src1_channels = srcNumChannels[inputIdx]; + if (c >= src1_channels) + { + // no data from second input + if (!coeffsptr) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = srcptr0[j]; + } + } + else + { + float c0 = coeffsptr[0]; + int z0 = op == PROD ? 
zeropointsptr[0] : 0; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] = c0 * (srcptr0[j] - z0); + } + } + } + else + { + size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize; + const int8_t* srcptrI = srcs[inputIdx]->ptr() + srcIdx; + + if (op == PROD) + { + float c0 = coeffsptr[0]; + float c1 = coeffsptr[1]; + int z0 = zeropointsptr[0]; + int z1 = zeropointsptr[1]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] = (c0*(srcptr0[j] - z0)) * (c1*(srcptrI[j] - z1)); + } + } + else if (op == MAX) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = std::max(srcptr0[j], srcptrI[j]); + } + } + else if (op == SUM) + { + float c0 = coeffsptr[0]; + float c1 = coeffsptr[1]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] = c0*srcptr0[j] + c1*srcptrI[j]; + } + } + else + CV_Error(Error::StsInternal, ""); + } + } + + // aggregate other inputs (3+) + for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++) + { + int srcI_channels = srcNumChannels[inputIdx]; + if (c >= srcI_channels) + continue; // no data from second input + size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize; + const int8_t* srcptrI = srcs[inputIdx]->ptr() + srcIdx; + + if (op == PROD) + { + float cI = coeffsptr[inputIdx]; + int zI = zeropointsptr[inputIdx]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] *= cI*(srcptrI[j] - zI); + } + } + else if (op == MAX) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = std::max(dstptr[j], srcptrI[j]); + } + } + else if (op == SUM) + { + float cI = coeffsptr[inputIdx]; + for (int j = 0; j < blockSize; j++) + { + bufptr[j] += cI * srcptrI[j]; + } + } + else + CV_Error(Error::StsInternal, ""); + } + + // add offset and saturate cast to int8 + if (op == SUM || op == PROD) + { + for (int j = 0; j < blockSize; j++) + { + dstptr[j] = saturate_cast(std::round(bufptr[j] + offset)); + } + } + } + if( activ ) + { + int8_t* ptr = dstptr0 + delta + sampleIdx*channels*planeSize; + activ->forwardSlice(ptr, lutptr, ptr, blockSize, planeSize, 0, channels); + } + } + } + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + CV_Assert(outputs.size() == 1); + const int nstripes = getNumThreads(); + + if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2) + { + for (size_t i = 0; i < inputs.size(); i++) + { + MatShape inpShape = shape(inputs[i].size); + bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims); + + if (allOnes) + { + Mat tmpInput = inputs[i]; + MatShape outShape = shape(outputs[0].size); + size_t xSize = outShape[2]; + for (size_t j = 3; j < outShape.size(); j++) + xSize *= outShape[j]; + + int dimVec[3] = {outShape[0], outShape[1], (int) xSize}; + std::vector matSizesVec(&dimVec[0], &dimVec[0] + 3); + inputs[i] = Mat(matSizesVec, tmpInput.type()); + + std::vector idx(outShape.size(), 0); + std::vector outIdx(inpShape.size(), 0); + + for (size_t j = 0; j < outShape[0]; j++) + { + outIdx[0] = idx[0] = j; + for(size_t k = 0; k < outShape[1]; k++) + { + outIdx[1] = idx[1] = k; + for (size_t x = 0; x < xSize; x++) + { + outIdx[2] = x; + inputs[i].at(outIdx.data()) = tmpInput.at(idx.data()); + } + } + } + inputs[i] = inputs[i].reshape(0, outShape); + } + } + } + + Mat buf = Mat(shape(outputs[0]), CV_32F); // to store intermediate results + 
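+        // SUM and PROD are accumulated in this float buffer with the per-input coeffs
+        // (and, for PROD, zero-points) applied; the invoker then adds 'offset', rounds and
+        // saturate-casts the result back to int8. MAX is computed directly on the int8 data.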
EltwiseInvoker::run(*this, &inputs[0], (int)inputs.size(), buf, outputs[0], nstripes, offset); + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + CV_Assert(inputs.size()); + + // FIXIT: handle inputs with different number of channels + long flops = inputs.size() * total(inputs[0]); + + return flops; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + activ = activ_int8; + if (!activ_int8->blobs.empty()) + activationLUT = activ_int8->blobs[0]; + return true; + } + return false; + } + + Mat activationLUT; + Ptr activ; + +private: + bool hasVecInput; + float offset; +}; + +Ptr EltwiseLayerInt8::create(const LayerParams& params) +{ + return Ptr(new EltwiseLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp new file mode 100644 index 0000000000..83da677a47 --- /dev/null +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -0,0 +1,266 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include + +namespace cv +{ +namespace dnn +{ + +class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8 +{ +public: + enum { VEC_ALIGN = 32 }; + FullyConnectedLayerInt8Impl(const LayerParams& params) + { + setParamsFrom(params); + output_zp = params.get("zeropoints"); + axis = params.get("axis", 1); + if (blobs.size() == 3) + { + // blobs[0] - Weights + // blobs[1] - Bias fused with offset + // blobs[2] - Multipliers for output stage + int numOutput = params.get("num_output"); + int innerSize = (int)blobs[0].total() / numOutput; + + CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total()); + CV_Assert((size_t)numOutput == blobs[1].total()); + + weightsMat = blobs[0] = blobs[0].reshape(1, numOutput); + int vecsize = weightsMat.cols; + if (vecsize % VEC_ALIGN != 0) + { + int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN); + Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type()); + Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned); + wpadding.setTo(Scalar::all(0)); + weightsMat = weightsBuf.colRange(0, vecsize); + blobs[0].copyTo(weightsMat); + } + biasMat = blobs[1] = blobs[1].reshape(1, 1); + outputMultiplier = blobs[2]; + } + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &) const CV_OVERRIDE + { + int numOutput, cAxis; + CV_CheckEQ(inputs.size(), (size_t)1, ""); + CV_CheckEQ(blobs[0].dims, 2, ""); + numOutput = blobs[0].size[0]; + CV_Assert((size_t)numOutput == blobs[1].total()); + cAxis = normalize_axis(axis, inputs[0]); + + MatShape outShape(cAxis + 1); + for (int i = 0; i < cAxis; ++i) + outShape[i] = inputs[0][i]; + outShape.back() = numOutput; + + outputs.resize(1, outShape); + return false; + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + virtual bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + activ = activ_int8; + if (!activ_int8->blobs.empty()) + activ_int8->blobs[0].convertTo(activationLUT, CV_32S); 
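+            // the LUT is converted to CV_32S because the fully connected kernel applies the
+            // activation through the int* overload of forwardSlice, on the int32 results,
+            // before the final convertTo(..., CV_8S) in forward().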
+ return true; + } + return false; + } + + class FullyConnected : public ParallelLoopBody + { + public: + FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0), + dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {} + + static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier, + const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp) + { + CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols && + dstMat.rows == srcMat.rows && dstMat.cols == weights.rows && + srcMat.type() == weights.type() && srcMat.type() == CV_8S && + dstMat.type() == CV_32S && biasMat.type() == CV_32S && + biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols ); + + FullyConnected p; + + p.srcMat = &srcMat; + p.weights = &weights; + p.biasMat = &biasMat; + p.outputMultiplier = &outputMultiplier; + p.activationLUT = &activationLUT; + p.dstMat = &dstMat; + p.nstripes = nstripes; + p.outZp = outZp; + p.activ = !activationLUT.empty() ? activ : 0; + p.useAVX2 = checkHardwareSupport(CPU_AVX2); + p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range& r) const CV_OVERRIDE + { + int valign = FullyConnectedLayerInt8Impl::VEC_ALIGN; + int nsamples = srcMat->rows; + int nw0 = weights->rows; + int k, vecsize = srcMat->cols; + int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN); + size_t total = (size_t)nsamples*nw0; + size_t stripeSize = (total + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total); + size_t wstep = weights->step1(); + AutoBuffer srcbuf(vecsize_aligned + valign); + int8_t* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(int8_t))); + const int* lutptr = !activationLUT->empty() ? 
activationLUT->ptr() : 0; + + for( k = vecsize; k < vecsize_aligned; k++ ) + sptr[k] = 0; + + for( size_t ofs = stripeStart; ofs < stripeEnd; ) + { + int sampleIdx = (int)(ofs / nw0); + int delta = (int)(ofs - (size_t)sampleIdx*nw0); + const int8_t* sptr_ = srcMat->ptr(sampleIdx); + const int8_t* wptr = weights->ptr(delta); + int* dptr = dstMat->ptr(sampleIdx) + delta; + const int* biasptr = biasMat->ptr() + delta; + const float* multptr = outputMultiplier->ptr() + delta; + int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs)); + + memcpy(sptr, sptr_, vecsize*sizeof(sptr[0])); + #if CV_TRY_AVX512_SKX + if( useAVX512 ) + opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else + #endif + #if CV_TRY_AVX2 + if( useAVX2 ) + opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else + #endif + { + int i = 0; + #if CV_SIMD + for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) + { + v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(), + vs2 = v_setzero_s32(), vs3 = v_setzero_s32(); + v_int32x4 outzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127); + v_int32x4 s = v_load(biasptr + i); + v_float32x4 mult = v_load(multptr + i); + + for( k = 0; k < vecsize; k += 16 ) + { + v_int8x16 v = v_load_aligned(sptr + k); + vs0 = v_dotprod_expand_fast(v, v_load_aligned(wptr + k), vs0); + vs1 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep + k), vs1); + vs2 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*2 + k), vs2); + vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3); + } + + s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)); + v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult); + v_store(dptr + i, v_min(v_max(out, outmin), outmax)); + } + #endif + + for( ; i < nw; i++, wptr += wstep ) + { + int s0 = biasptr[i]; + float mult0 = multptr[i]; + + for( k = 0; k < vecsize; k++ ) + { + int8_t v = sptr[k]; + s0 += (int)v*wptr[k]; + } + int out0 = outZp + (int)std::round(s0*mult0); + dptr[i] = std::min(std::max(out0, -128), 127); + } + } + + if(activ) + activ->forwardSlice(dptr, lutptr, dptr, 1, 1, delta, delta + nw); + + ofs += nw; + } + } + + const Mat *srcMat, *weights, *biasMat, *outputMultiplier, *activationLUT; + const ActivationLayerInt8* activ; + Mat* dstMat; + int nstripes, outZp; + bool useAVX2; + bool useAVX512; + }; + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector input, output; + inputs_arr.getMatVector(input); + outputs_arr.getMatVector(output); + + int axisCan = normalize_axis(axis, input[0].dims); + int outerSize = input[0].total(0, axisCan); + Mat srcMat = input[0].reshape(1, outerSize); + + Mat dstMat = output[0].reshape(1, outerSize); + Mat dstMatInt32= Mat(shape(dstMat), CV_32S); + + const int nstripes = getNumThreads(); + FullyConnected::run(srcMat, weightsMat, biasMat, outputMultiplier, activationLUT, dstMatInt32, activ.get(), nstripes, output_zp); + dstMatInt32.convertTo(dstMat, CV_8S); + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(inputs); // suppress unused variable warning + long flops = 0; + + int innerSize = blobs[0].size[1]; + for(int i = 0; i < outputs.size(); i++) + { + flops += CV_BIG_INT(3)*innerSize*total(outputs[i]); + } + + return flops; + + } + + Mat 
weightsMat, biasMat, outputMultiplier, activationLUT;
+    Ptr<ActivationLayerInt8> activ;
+};
+
+Ptr<InnerProductLayerInt8> InnerProductLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<InnerProductLayerInt8>(new FullyConnectedLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/layers_common.hpp b/modules/dnn/src/int8layers/layers_common.hpp
new file mode 100644
index 0000000000..cb185a9eda
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_common.hpp
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
+#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
+#include <opencv2/dnn.hpp>
+#include <opencv2/dnn/shape_utils.hpp>
+
+#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+// dispatched AVX/AVX2 optimizations
+#include "./layers_common.simd.hpp"
+#include "int8layers/layers_common.simd_declarations.hpp"
+#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#ifdef HAVE_OPENCL
+#include "../ocl4dnn/include/ocl4dnn.hpp"
+#endif
+
+namespace cv
+{
+namespace dnn
+{
+void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
+                                std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations,
+                                cv::String &padMode, std::vector<size_t>& adjust_pads);
+
+void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<bool>& globalPooling,
+                            std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);
+
+void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& stride, const String &padMode,
+                          const std::vector<size_t>& dilation, std::vector<int>& out);
+
+ void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& strides, const String &padMode,
+                          std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
+}
+}
+
+#endif
diff --git a/modules/dnn/src/int8layers/layers_common.simd.hpp b/modules/dnn/src/int8layers/layers_common.simd.hpp
new file mode 100644
index 0000000000..bf6149e5c9
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_common.simd.hpp
@@ -0,0 +1,637 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
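Every int8 kernel in this patch finishes with the same requantization step: the int32 accumulator is scaled by a per-output-channel float multiplier, shifted by the output zero-point, and saturated to the int8 range. A minimal sketch of that step (the helper name requantize is illustrative and not part of the patch):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative sketch only: mirrors the scalar tail loops in these kernels, e.g.
    // outptr[j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
    static inline int8_t requantize(int acc, float multiplier, int outZp)
    {
        int out = outZp + (int)std::round(acc * multiplier);   // scale and shift to the output zero-point
        return (int8_t)std::min(std::max(out, -128), 127);     // saturate to the int8 range
    }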
+ +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ); +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp ); +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX2 +#define OPENCV_FMADD_EPI8(_Tpvec, func) \ + inline _Tpvec _##func##_fmaddepi8_epi32(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { \ + _Tpvec even_a = _##func##_srai_epi16(_##func##_bslli_epi128(a, 1), 8); \ + _Tpvec odd_a = _##func##_srai_epi16(a, 8); \ + \ + _Tpvec even_b = _##func##_srai_epi16(_##func##_bslli_epi128(b, 1), 8); \ + _Tpvec odd_b = _##func##_srai_epi16(b, 8); \ + \ + _Tpvec prod0 = _##func##_madd_epi16(even_a, even_b); \ + _Tpvec prod1 = _##func##_madd_epi16(odd_a, odd_b); \ + return _##func##_add_epi32(_##func##_add_epi32(prod0, prod1), c); \ + } +OPENCV_FMADD_EPI8(__m256i, mm256) +//OPENCV_FMADD_EPI8(__m512i, mm512) + +enum { FASCONV_BASE_VECSZ = 4 }; + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; + int rsz = blockSize % FASCONV_BASE_VECSZ; + for( int i = 0; i < rsz; i++ ) + maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; + __m128 mask = _mm_loadu_ps((const float*)maskbuf); + + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + const int8_t* wptr0 = weights + i*wstep; + const int8_t* wptr1 = wptr0 + wstep; + const int8_t* wptr2 = wptr1 + wstep; + int* outptr0 = output + i*outPlaneSize; + int* outptr1 = outptr0 + outPlaneSize; + int* outptr2 = outptr1 + outPlaneSize; + int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + mult2 = mult1; + + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + mult2 = mult1 = mult0; + } + } + int j = 0; + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) + { + bool tail = false; + if (j + FASCONV_BASE_VECSZ > blockSize) + { + if (j == 0) + break; + j = blockSize - FASCONV_BASE_VECSZ; + tail = true; + } + int k = 0; + const int8_t* rptr = rowbuf + j*vecsize_aligned; + + __m256i vs00 = _mm256_setzero_si256(), vs01 = _mm256_setzero_si256(), + vs02 = _mm256_setzero_si256(), vs03 = _mm256_setzero_si256(), + vs10 = _mm256_setzero_si256(), vs11 = _mm256_setzero_si256(), + vs12 = _mm256_setzero_si256(), vs13 = _mm256_setzero_si256(), + vs20 = _mm256_setzero_si256(), vs21 = 
_mm256_setzero_si256(), + vs22 = _mm256_setzero_si256(), vs23 = _mm256_setzero_si256(); + + /* TODO : Fix AVX-512 path. Segmentation fault in Conv2D Tests. +#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling + if (vecsize >= 64) + { + __m512i vs00_5 = _mm512_setzero_si512(), vs01_5 = _mm512_setzero_si512(), + vs02_5 = _mm512_setzero_si512(), vs03_5 = _mm512_setzero_si512(), + vs10_5 = _mm512_setzero_si512(), vs11_5 = _mm512_setzero_si512(), + vs12_5 = _mm512_setzero_si512(), vs13_5 = _mm512_setzero_si512(), + vs20_5 = _mm512_setzero_si512(), vs21_5 = _mm512_setzero_si512(), + vs22_5 = _mm512_setzero_si512(), vs23_5 = _mm512_setzero_si512(); + + for (; k <= vecsize - 64; k += 64, rptr += 64) + { + __m512i w0 = _mm512_load_si512(wptr0 + k); + __m512i w1 = _mm512_load_si512(wptr1 + k); + __m512i w2 = _mm512_load_si512(wptr2 + k); + __m512i r0 = _mm512_load_si512(rptr); + + vs00_5 = _mm512_fmaddepi8_epi32(w0, r0, vs00_5); + vs10_5 = _mm512_fmaddepi8_epi32(w1, r0, vs10_5); + vs20_5 = _mm512_fmaddepi8_epi32(w2, r0, vs20_5); + + r0 = _mm512_load_si512(rptr + vecsize_aligned); + vs01_5 = _mm512_fmaddepi8_epi32(w0, r0, vs01_5); + vs11_5 = _mm512_fmaddepi8_epi32(w1, r0, vs11_5); + vs21_5 = _mm512_fmaddepi8_epi32(w2, r0, vs21_5); + + r0 = _mm512_load_si512(rptr + vecsize_aligned*2); + vs02_5 = _mm512_fmaddepi8_epi32(w0, r0, vs02_5); + vs12_5 = _mm512_fmaddepi8_epi32(w1, r0, vs12_5); + vs22_5 = _mm512_fmaddepi8_epi32(w2, r0, vs22_5); + + r0 = _mm512_load_si512(rptr + vecsize_aligned*3); + vs03_5 = _mm512_fmaddepi8_epi32(w0, r0, vs03_5); + vs13_5 = _mm512_fmaddepi8_epi32(w1, r0, vs13_5); + vs23_5 = _mm512_fmaddepi8_epi32(w2, r0, vs23_5); + } + + // now fold the 512 bit accumulator vectors into 256 bit vectors so that the AVX2 code can finish + // the tail of the vector + + vs00 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs00_5, 0), _mm512_extracti32x8_epi32(vs00_5, 1)); + vs10 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs10_5, 0), _mm512_extracti32x8_epi32(vs10_5, 1)); + vs20 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs20_5, 0), _mm512_extracti32x8_epi32(vs20_5, 1)); + + vs01 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs01_5, 0), _mm512_extracti32x8_epi32(vs01_5, 1)); + vs11 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs11_5, 0), _mm512_extracti32x8_epi32(vs11_5, 1)); + vs21 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs21_5, 0), _mm512_extracti32x8_epi32(vs21_5, 1)); + + vs02 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs02_5, 0), _mm512_extracti32x8_epi32(vs02_5, 1)); + vs12 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs12_5, 0), _mm512_extracti32x8_epi32(vs12_5, 1)); + vs22 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs22_5, 0), _mm512_extracti32x8_epi32(vs22_5, 1)); + + vs03 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs03_5, 0), _mm512_extracti32x8_epi32(vs03_5, 1)); + vs13 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs13_5, 0), _mm512_extracti32x8_epi32(vs13_5, 1)); + vs23 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs23_5, 0), _mm512_extracti32x8_epi32(vs23_5, 1)); + } +#endif + */ + for (; k < vecsize; k += 32, rptr += 32 ) + { + __m256i w0 = _mm256_load_si256((const __m256i*)(wptr0 + k)); + __m256i w1 = _mm256_load_si256((const __m256i*)(wptr1 + k)); + __m256i w2 = _mm256_load_si256((const __m256i*)(wptr2 + k)); + __m256i r0 = _mm256_load_si256((const __m256i*)rptr); + + vs00 = _mm256_fmaddepi8_epi32(w0, r0, vs00); + vs10 = _mm256_fmaddepi8_epi32(w1, r0, vs10); + vs20 = _mm256_fmaddepi8_epi32(w2, r0, vs20); + + r0 = _mm256_load_si256((const 
__m256i*)(rptr + vecsize_aligned)); + vs01 = _mm256_fmaddepi8_epi32(w0, r0, vs01); + vs11 = _mm256_fmaddepi8_epi32(w1, r0, vs11); + vs21 = _mm256_fmaddepi8_epi32(w2, r0, vs21); + + r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*2)); + vs02 = _mm256_fmaddepi8_epi32(w0, r0, vs02); + vs12 = _mm256_fmaddepi8_epi32(w1, r0, vs12); + vs22 = _mm256_fmaddepi8_epi32(w2, r0, vs22); + + r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*3)); + vs03 = _mm256_fmaddepi8_epi32(w0, r0, vs03); + vs13 = _mm256_fmaddepi8_epi32(w1, r0, vs13); + vs23 = _mm256_fmaddepi8_epi32(w2, r0, vs23); + } + + __m256i t0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs00, vs01), _mm256_hadd_epi32(vs02, vs03)); + __m256i t1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs10, vs11), _mm256_hadd_epi32(vs12, vs13)); + __m256i t2 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs20, vs21), _mm256_hadd_epi32(vs22, vs23)); + + t0 = _mm256_add_epi32(t0, _mm256_permute2x128_si256(t0, t0, 1)); + t1 = _mm256_add_epi32(t1, _mm256_permute2x128_si256(t1, t1, 1)); + t2 = _mm256_add_epi32(t2, _mm256_permute2x128_si256(t2, t2, 1)); + + __m128i s0, s1, s2; + + if( initOutput ) + { + s0 = _mm_set1_epi32(bias0); + s1 = _mm_set1_epi32(bias1); + s2 = _mm_set1_epi32(bias2); + } + else + { + s0 = _mm_loadu_si128((__m128i*)(outptr0 + j)); + s1 = _mm_loadu_si128((__m128i*)(outptr1 + j)); + s2 = _mm_loadu_si128((__m128i*)(outptr2 + j)); + } + + s0 = _mm_add_epi32(s0, _mm256_castsi256_si128(t0)); + s1 = _mm_add_epi32(s1, _mm256_castsi256_si128(t1)); + s2 = _mm_add_epi32(s2, _mm256_castsi256_si128(t2)); + + if( finalOutput ) + { + __m128i voutzp = _mm_set1_epi32(outZp); + __m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127); + s0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s0), _mm_set1_ps(mult0)))); + s1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s1), _mm_set1_ps(mult1)))); + s2 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s2), _mm_set1_ps(mult2)))); + + s0 = _mm_min_epi32(_mm_max_epi32(s0, outmin), outmax); + s1 = _mm_min_epi32(_mm_max_epi32(s1, outmin), outmax); + s2 = _mm_min_epi32(_mm_max_epi32(s2, outmin), outmax); + } + if( tail ) + { + s0 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr0 + j), _mm_castsi128_ps(s0), mask)); + s1 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr1 + j), _mm_castsi128_ps(s1), mask)); + s2 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr2 + j), _mm_castsi128_ps(s2), mask)); + } + _mm_storeu_si128((__m128i*)(outptr0 + j), s0); + _mm_storeu_si128((__m128i*)(outptr1 + j), s1); + _mm_storeu_si128((__m128i*)(outptr2 + j), s2); + } + + for( ; j <= blockSize - 2; j += 2 ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned; + int s00, s01, s10, s11, s20, s21; + + if( initOutput ) + { + s00 = s01 = bias0; + s10 = s11 = bias1; + s20 = s21 = bias2; + } + else + { + s00 = outptr0[j]; s01 = outptr0[j+1]; + s10 = outptr1[j]; s11 = outptr1[j+1]; + s20 = outptr2[j]; s21 = outptr2[j+1]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + r = rptr1[k]; + s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127); + s10 = 
std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127); + } + outptr0[j] = s00; + outptr0[j+1] = s01; + outptr1[j] = s10; + outptr1[j+1] = s11; + outptr2[j] = s20; + outptr2[j+1] = s21; + } + + for( ; j < blockSize; j++ ) + { + const int8_t* rptr0 = rowbuf + j*vecsize_aligned; + int s00, s10, s20; + + if( initOutput ) + { + s00 = bias0; + s10 = bias1; + s20 = bias2; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + s20 = outptr2[j]; + } + + for( int k = 0; k < vecsize; k++ ) + { + int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; + int8_t r = rptr0[k]; + s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; + } + + if( finalOutput ) + { + s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); + s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); + s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); + } + outptr0[j] = s00; + outptr1[j] = s10; + outptr2[j] = s20; + } + } + _mm256_zeroupper(); +} + +static inline void _mm256_expand_mul_add(const __m256i& a, const __m256i& b, + __m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3) +{ + __m256i a0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a)); + __m256i a1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a, 1)); + + __m256i b0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(b)); + __m256i b1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b, 1)); + + __m256i a0b0 = _mm256_mullo_epi16(a0, b0); + __m256i a1b1 = _mm256_mullo_epi16(a1, b1); + + out0 = _mm256_add_epi32(out0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a0b0))); + out1 = _mm256_add_epi32(out1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a0b0, 1))); + out2 = _mm256_add_epi32(out2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a1b1))); + out3 = _mm256_add_epi32(out3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a1b1, 1))); +} + +static inline void _mm256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b) +{ + __m256i t0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i t1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + + const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m256i p0 = _mm256_shuffle_epi8(t0, sh); + __m256i p1 = _mm256_shuffle_epi8(t1, sh); + __m256i lo = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i hi = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + a = _mm256_unpacklo_epi64(lo, hi); + b = _mm256_unpackhi_epi64(lo, hi); +} + +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp) +{ + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float mult = multptr[out_d]; + int bias = biasptr[out_d]; + int biasCopy; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t* imgptr0 = inptr_ + in_i*width; + const int8_t* imgptr1 = imgptr0 
+ dilation_h*width; + const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out; + biasCopy = bias; + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + + (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + + (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + + biasCopy + inpZp*(w00 + w10 + w20); + outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 32; + __m256i vw00 = _mm256_set1_epi8(w00), vw01 = _mm256_set1_epi8(w01), vw02 = _mm256_set1_epi8(w02), + vw10 = _mm256_set1_epi8(w10), vw11 = _mm256_set1_epi8(w11), vw12 = _mm256_set1_epi8(w12), + vw20 = _mm256_set1_epi8(w20), vw21 = _mm256_set1_epi8(w21), vw22 = _mm256_set1_epi8(w22); + __m256i vbias = _mm256_set1_epi32(biasCopy), voutzp = _mm256_set1_epi32(outZp), + outmin = _mm256_set1_epi32(-128), outmax = _mm256_set1_epi32(127); + __m256 vmult = _mm256_set1_ps(mult); + __m256i vout0, vout1, vout2, vout3; + + if( stride_w == 1 ) + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j)), + v01 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w)), + v02 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w*2)), + v10 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j)), + v11 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w)), + v12 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w*2)), + v20 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j)), + v21 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w)), + v22 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w*2)); + + vout0 = vout1 = vout2 = vout3 = vbias; + _mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult))); + vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult))); + vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult))); + vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult))); + + vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax); + vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax); + vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax); + vout3 = 
_mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax); + + _mm256_storeu_si256((__m256i*)(outptr + out_j), vout0); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3); + } + } + else + { + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1) + { + if (out_j <= pad_l) + break; + out_j = outW1 - VECSZ; + } + int in_j = out_j * stride_w - pad_l; + __m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _mm256_load_deinterleave(imgptr0 + in_j, v00, v01); + _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _mm256_load_deinterleave(imgptr1 + in_j, v10, v11); + _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _mm256_load_deinterleave(imgptr2 + in_j, v20, v21); + _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + vout0 = vout1 = vout2 = vout3 = vbias; + _mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); + _mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); + + vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult))); + vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult))); + vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult))); + vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult))); + + vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax); + vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax); + vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax); + vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax); + + _mm256_storeu_si256((__m256i*)(outptr + out_j), vout0); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2); + _mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3); + } + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + + (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + + (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + int s0 = 1, s1 = 1, s2 = 1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp*(w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp*(w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp*(w02 + w12 + w22); + } + out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + + (int)imgptr1[in_j0]*w10*s0 
+ (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + + (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + } + _mm256_zeroupper(); +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ) +{ + int i = 0; + + for( ; i <= nvecs - 8; i += 8 ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = _mm256_setzero_si256(), vs1 = _mm256_setzero_si256(), + vs2 = _mm256_setzero_si256(), vs3 = _mm256_setzero_si256(), + vs4 = _mm256_setzero_si256(), vs5 = _mm256_setzero_si256(), + vs6 = _mm256_setzero_si256(), vs7 = _mm256_setzero_si256(); + + __m128i voutzp = _mm_set1_epi32(outZp); + __m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = _mm256_load_si256((const __m256i*)(vec + k)); + + vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0); + vs1 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep)), v, vs1); + vs2 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*2)), v, vs2); + vs3 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*3)), v, vs3); + vs4 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*4)), v, vs4); + vs5 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*5)), v, vs5); + vs6 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*6)), v, vs6); + vs7 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*7)), v, vs7); + } + + __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs1), _mm256_hadd_epi32(vs2, vs3)); + __m256i s1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs4, vs5), _mm256_hadd_epi32(vs6, vs7)); + + s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1)); + s1 = _mm256_add_epi32(s1, _mm256_permute2x128_si256(s1, s1, 1)); + + __m128i t0 = _mm_add_epi32(_mm256_castsi256_si128(s0), _mm_loadu_si128((__m128i*)(bias + i))); + __m128i t1 = _mm_add_epi32(_mm256_castsi256_si128(s1), _mm_loadu_si128((__m128i*)(bias + i + 4))); + + t0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t0), _mm_loadu_ps(multiplier + i)))); + t1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t1), _mm_loadu_ps(multiplier + i + 4)))); + + t0 = _mm_min_epi32(_mm_max_epi32(t0, outmin), outmax); + t1 = _mm_min_epi32(_mm_max_epi32(t1, outmin), outmax); + + _mm_storeu_si128((__m128i*)(dst + i), t0); + _mm_storeu_si128((__m128i*)(dst + i + 4), t1); + } + + for( ; i < nvecs; i++ ) + { + const int8_t* wptr = weights + i*wstep; + __m256i vs0 = _mm256_setzero_si256(); + + for( int k = 0; k < vecsize; k += 32, wptr += 32 ) + { + __m256i v = _mm256_load_si256((const __m256i*)(vec + k)); + vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0); + } + + __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs0), vs0); + s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1)); + int temp = _mm_extract_epi32(_mm256_castsi256_si128(s0), 0); + dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]); + } + + _mm256_zeroupper(); +} +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/dnn/src/int8layers/pooling_layer.cpp 
b/modules/dnn/src/int8layers/pooling_layer.cpp new file mode 100644 index 0000000000..20a0486a46 --- /dev/null +++ b/modules/dnn/src/int8layers/pooling_layer.cpp @@ -0,0 +1,595 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" +#include "opencv2/core/hal/intrin.hpp" + +#include +#include +#include +using std::max; +using std::min; + +namespace cv +{ +namespace dnn +{ + +class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8 +{ +public: + PoolingLayerInt8Impl(const LayerParams& params) + { + computeMaxIdx = false; + globalPooling = false; + isGlobalPooling = std::vector(3, false); + output_zp = params.get("zeropoints"); + input_zp = params.get("input_zeropoint", 0); + multiplier = params.get("multiplier", 1.f); + + hasDynamicShapes = params.get("has_dynamic_shapes", false); + shapesInitialized = !hasDynamicShapes; + + if (params.has("pool") || params.has("kernel_size") || + params.has("kernel_w") || params.has("kernel_h")) + { + String pool = toLowerCase(params.get("pool", "max")); + if (pool == "max") + type = MAX; + else if (pool == "ave") + type = AVE; + else if (pool == "sum") + type = SUM; + else + CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\""); + + getPoolingKernelParams(params, kernel_size, isGlobalPooling, pads_begin, pads_end, strides, padMode); + globalPooling = isGlobalPooling[0] || isGlobalPooling[1] || isGlobalPooling[2]; + } + else + CV_Error(Error::StsBadArg, "Cannot determine pooling type"); + setParamsFrom(params); + ceilMode = params.get("ceil_mode", true); + spatialScale = params.get("spatial_scale", 1); + avePoolPaddedArea = params.get("ave_pool_padded_area", true); + } + + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + CV_Assert(!inputs.empty()); + CV_Assert(outputs.size() == 1); + + std::vector inp; + std::vector out; + for (int i = 2; i < inputs[0].dims; i++) { + inp.push_back(inputs[0].size[i]); + out.push_back(outputs[0].size[i]); + } + if (globalPooling) { + std::vector finalKernel; + for (int i = 0; i < inp.size(); i++) { + int idx = isGlobalPooling.size() - inp.size() + i; + finalKernel.push_back(isGlobalPooling[idx] ? 
inp[i] : kernel_size[idx]); + } + kernel_size = finalKernel; + } + + getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end); + + if (inputs[0].dims == 3) + { + // Pool1D + kernel_size.assign(1, kernel_size[0]); + strides.assign(1, strides[0]); + pads_begin.assign(1, pads_begin[0]); + pads_end.assign(1, pads_end[0]); + } + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + if (backendId == DNN_BACKEND_OPENCV) + { + if (kernel_size.size() == 3) + return preferableTarget == DNN_TARGET_CPU; + if (kernel_size.size() <= 2) + return true; + else + return false; + } + return false; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + return activ_int8->blobs.empty(); + } + return false; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + switch (type) + { + case MAX: + { + CV_Assert_N(inputs.size() == 1, outputs.size() == 1); + maxPooling(inputs[0], outputs[0]); + break; + } + case AVE: case SUM: + CV_Assert_N(inputs.size() == 1, outputs.size() == 1); + avePooling(inputs[0], outputs[0]); + break; + default: + CV_Error(Error::StsNotImplemented, "Not implemented"); + break; + } + } + + class PoolingInvoker : public ParallelLoopBody + { + public: + const Mat* src, *rois; + Mat *dst; + int pad_l, pad_t, pad_r, pad_b; + bool avePoolPaddedArea; + int nstripes, inpZp, outZp; + std::vector ofsbuf; + int poolingType; + float multiplier; + float spatialScale; + + std::vector pads_begin, pads_end; + std::vector kernel_size; + std::vector strides; + + PoolingInvoker() : src(0), rois(0), dst(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0), + avePoolPaddedArea(false), nstripes(0), inpZp(0), outZp(0), + poolingType(MAX), multiplier(1), spatialScale(0){} + + static void run(const Mat& src, const Mat& rois, Mat& dst, + std::vector kernel_size, std::vector strides, + std::vector pads_begin, std::vector pads_end, + bool avePoolPaddedArea, int poolingType, float spatialScale, + float multiplier, int inpZp, int outZp, int nstripes) + { + CV_Assert_N( + src.isContinuous(), dst.isContinuous(), + src.type() == CV_8S, src.type() == dst.type(), + src.dims == 3 || src.dims == 4 || src.dims == 5, dst.dims == 3 || dst.dims == 4 || dst.dims == 5, + src.size[0] == dst.size[0], src.size[1] == dst.size[1], rois.empty()); + + PoolingInvoker p; + + bool isPool1D = src.dims == 3; + bool isPool3D = src.dims == 5; + + p.src = &src; + p.rois = &rois; + p.dst = &dst; + + p.kernel_size = kernel_size; + p.strides = strides; + p.pads_begin = pads_begin; + p.pads_end = pads_end; + + p.pad_l = pads_begin.back(); + p.pad_t = isPool1D ? 0 : pads_begin[pads_begin.size() - 2]; + p.pad_r = pads_end.back(); + p.pad_b = isPool1D ? 0 : pads_end[pads_end.size() - 2]; + + p.avePoolPaddedArea = avePoolPaddedArea; + p.nstripes = nstripes; + p.inpZp = inpZp; + p.outZp = outZp; + p.poolingType = poolingType; + p.spatialScale = spatialScale; + p.multiplier = multiplier; + + int height = isPool1D ? 1 : src.size[src.dims - 2]; + int width = src.size[src.dims - 1]; + + int kernel_d = isPool3D ? kernel_size[0] : 1; + int kernel_h = isPool1D ? 
1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + + p.ofsbuf.resize(kernel_d * kernel_h * kernel_w); + for (int i = 0; i < kernel_d; ++i) { + for (int j = 0; j < kernel_h; ++j) { + for (int k = 0; k < kernel_w; ++k) { + p.ofsbuf[i * kernel_h * kernel_w + j * kernel_w + k] = width * height * i + width * j + k; + } + } + } + + parallel_for_(Range(0, nstripes), p, nstripes); + } + + void operator()(const Range& r) const CV_OVERRIDE + { + int channels = dst->size[1]; + + bool isPool3D = src->dims == 5; + bool isPool2D = src->dims == 4; + bool isPool1D = src->dims == 3; + int depth = isPool3D? dst->size[2] : 1; + int height = isPool1D? 1 : dst->size[dst->dims - 2]; + int width = dst->size[dst->dims - 1]; + + int inp_depth = isPool3D? src->size[2] : 1; + int inp_height = isPool1D? 1 : src->size[src->dims - 2]; + int inp_width = src->size[src->dims - 1]; + + size_t total = dst->total(); + size_t stripeSize = (total + nstripes - 1)/nstripes; + size_t stripeStart = r.start*stripeSize; + size_t stripeEnd = std::min(r.end*stripeSize, total); + + int kernel_d = isPool3D? kernel_size[0] : 1; + int kernel_h = isPool1D? 1 : kernel_size[kernel_size.size() - 2]; + int kernel_w = kernel_size.back(); + + int stride_d = isPool3D? strides[0] : 0; + int stride_h = isPool1D? 1 :strides[strides.size() - 2]; + int stride_w = strides.back(); + +#if CV_SIMD128 + const int* ofsptr = (const int*)&ofsbuf[0]; + if (poolingType == MAX && !ofsptr) + CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode"); +#endif + + for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; ) + { + size_t ofs = ofs0; + int x0 = (int)(ofs % width); + ofs /= width; + int y0 = (int)(ofs % height); + ofs /= height; + + int d0 = (int)(ofs % depth); + ofs /= depth; + + int c = (int)(ofs % channels); + int n = (int)(ofs / channels); + int ystart, yend; + int dstart = 0, dend = 1; + + const int8_t *srcData = 0; + int pad_d_begin = (pads_begin.size() == 3) ? 
pads_begin[0] : 0; + dstart = d0 * stride_d - pad_d_begin; + dend = min(dstart + kernel_d, (int)(inp_depth + pads_end[0])); + + ystart = y0 * stride_h - pad_t; + yend = min(ystart + kernel_h, inp_height + pad_b); + srcData = src->ptr(n, c); + + int ddelta = dend - dstart; + dstart = max(dstart, 0); + dend = min(dend, inp_depth); + int ydelta = yend - ystart; + ystart = max(ystart, 0); + yend = min(yend, inp_height); + int8_t *dstData = &dst->ptr(n, c, d0)[y0 * width]; + + int delta = std::min((int)(stripeEnd - ofs0), width - x0); + ofs0 += delta; + int x1 = x0 + delta; + + if( poolingType == MAX ) + for( ; x0 < x1; x0++ ) + { + int xstart = x0 * stride_w - pad_l; + int xend = min(xstart + kernel_w, inp_width); + xstart = max(xstart, 0); + if (xstart >= xend || ystart >= yend) + { + dstData[x0] = (int8_t)outZp; + continue; + } +#if CV_SIMD128 + if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width ) + { + v_int8x16 max_val0 = v_setall_s8(-128); + if( yend - ystart == kernel_h ) + { + const int8_t* srcData1 = srcData + ystart*inp_width + xstart; + if( stride_w == 1 ) + for (int k = 0; k < kernel_w*kernel_h; k++) + { + int index = ofsptr[k]; + v_int8x16 v0 = v_load(srcData1 + index); + max_val0 = v_max(max_val0, v0); + } + else if( stride_w == 2 ) + for (int k = 0; k < kernel_w*kernel_h; k++) + { + int index = ofsptr[k]; + v_int8x16 v0, dummy; + v_load_deinterleave(srcData1 + index, v0, dummy); + max_val0 = v_max(max_val0, v0); + } + else + for (int k = 0; k < kernel_w*kernel_h; k++) + { + int index = ofsptr[k]; + v_int8x16 v0(srcData1[index], srcData1[index + stride_w], + srcData1[index + stride_w*2], srcData1[index + stride_w*3], + srcData1[index + stride_w*4], srcData1[index + stride_w*5], + srcData1[index + stride_w*6], srcData1[index + stride_w*7], + srcData1[index + stride_w*8], srcData1[index + stride_w*9], + srcData1[index + stride_w*10], srcData1[index + stride_w*11], + srcData1[index + stride_w*12], srcData1[index + stride_w*13], + srcData1[index + stride_w*14], srcData1[index + stride_w*15]); + max_val0 = v_max(max_val0, v0); + } + } + else + { + for (int y = ystart; y < yend; ++y) + { + for (int x = xstart; x < xend; ++x) + { + const int index = y * inp_width + x; + v_int8x16 v0(srcData[index], srcData[index + stride_w], + srcData[index + stride_w*2], srcData[index + stride_w*3], + srcData[index + stride_w*4], srcData[index + stride_w*5], + srcData[index + stride_w*6], srcData[index + stride_w*7], + srcData[index + stride_w*8], srcData[index + stride_w*9], + srcData[index + stride_w*10], srcData[index + stride_w*11], + srcData[index + stride_w*12], srcData[index + stride_w*13], + srcData[index + stride_w*14], srcData[index + stride_w*15]); + max_val0 = v_max(max_val0, v0); + } + } + } + v_store(dstData + x0, max_val0); + x0 += 15; + } + else +#else + CV_UNUSED(isPool2D); +#endif + if( isPool1D ) + { + const int8_t* first = srcData + xstart; + const int8_t* last = srcData + xend; + const int8_t* max_elem = std::max_element(first, last); + if (max_elem != last) + dstData[x0] = *max_elem; + } + else + { + int8_t max_val = -128; + for (int d = dstart; d < dend; ++d) { + for (int y = ystart; y < yend; ++y) { + for (int x = xstart; x < xend; ++x) { + const int index = d * inp_width * inp_height + y * inp_width + x; + int8_t val = srcData[index]; + max_val = std::max(max_val, val); + } + } + } + dstData[x0] = max_val; + } + } + else if (poolingType == AVE || poolingType == SUM) + { + for( ; x0 < x1; ++x0) + { + int xstart = x0 * stride_w - 
pad_l; + int xend = min(xstart + kernel_w, inp_width + pad_r); + int xdelta = xend - xstart; + xstart = max(xstart, 0); + xend = min(xend, inp_width); + + int real_kernel_area = (dend - dstart) * (yend - ystart) * (xend - xstart); + int padded_kernel_area = xdelta * ydelta * ddelta; + int kernel_area = avePoolPaddedArea ? padded_kernel_area : real_kernel_area; + + int bias = (avePoolPaddedArea ? (padded_kernel_area - real_kernel_area) * inpZp : 0) + - (inpZp * kernel_area); + float inv_kernel_area = poolingType == AVE ? multiplier / kernel_area : multiplier; +#if CV_SIMD128 + if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width ) + { + v_int32x4 sum_val0 = v_setall_s32(bias), sum_val1 = v_setall_s32(bias), + sum_val2 = v_setall_s32(bias), sum_val3 = v_setall_s32(bias), + voutzp = v_setall_s32(outZp); + v_float32x4 ikarea = v_setall_f32(inv_kernel_area); + + for (int y = ystart; y < yend; ++y) + { + for (int x = xstart; x < xend; ++x) + { + const int index = y * inp_width + x; + v_int32x4 v0((int)srcData[index], (int)srcData[index + stride_w], + (int)srcData[index + stride_w*2], (int)srcData[index + stride_w*3]); + v_int32x4 v1((int)srcData[index + stride_w*4], (int)srcData[index + stride_w*5], + (int)srcData[index + stride_w*6], (int)srcData[index + stride_w*7]); + v_int32x4 v2((int)srcData[index + stride_w*8], (int)srcData[index + stride_w*9], + (int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]); + v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13], + (int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]); + sum_val0 += v0; + sum_val1 += v1; + sum_val2 += v2; + sum_val3 += v3; + } + } + + sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp; + sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp; + sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp; + sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp; + + v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3))); + x0 += 15; + } + else +#endif + if( isPool1D ) + { + const int8_t* first = srcData + xstart; + const int8_t* last = srcData + xend; + int sum_val = bias + std::accumulate(first, last, 0); + dstData[x0] = saturate_cast(outZp + std::round(sum_val*inv_kernel_area)); + } + else + { + int sum_val = bias; + for (int d = dstart; d < dend; ++d) { + for (int y = ystart; y < yend; ++y) { + for (int x = xstart; x < xend; ++x) { + const int index = d * inp_width * inp_height + y * inp_width + x; + int8_t val = srcData[index]; + sum_val += (int)val; + } + } + } + dstData[x0] = saturate_cast(outZp + std::round(sum_val*inv_kernel_area)); + } + } + } + } + } + }; + + void maxPooling(Mat &src, Mat &dst) + { + const int nstripes = getNumThreads(); + Mat rois; + PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, + spatialScale, multiplier, input_zp, output_zp, nstripes); + } + + void avePooling(Mat &src, Mat &dst) + { + const int nstripes = getNumThreads(); + Mat rois; + PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, + spatialScale, multiplier, input_zp, output_zp, nstripes); + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() != 0); + + bool isPool1D = inputs[0].size() == 3; + std::vector inpShape(inputs[0].begin() + 2, inputs[0].end()); + std::vector 
outShape(inputs[0].begin(), inputs[0].begin() + 2); + + std::vector local_kernel; + if (globalPooling) { + for (int i = 0; i < inpShape.size(); i++) { + int idx = isGlobalPooling.size() - inpShape.size() + i; + local_kernel.push_back(isGlobalPooling[idx] ? inpShape[i] : kernel_size[idx]); + } + } else { + local_kernel = kernel_size; + } + + if (hasDynamicShapes && !shapesInitialized) + { + //Just copy input shapes for width and height to prevent errors on loading stage + for (int i = 0; i < inpShape.size(); i++) + outShape.push_back(inpShape[i]); + } + else if (padMode.empty()) + { + int addedDims = isPool1D? inpShape.size() : local_kernel.size(); + for (int i = 0; i < addedDims; i++) { + float dst = (float) (inpShape[i] + pads_begin[i] + pads_end[i] - local_kernel[i]) / strides[i]; + outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst))); + } + + // If we have padding, ensure that the last pooling starts strictly + // inside the image (instead of at the padding); otherwise clip the last. + for (int i = 0; i < addedDims; i++) { + if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) { + --outShape[2 + i]; + CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]); + } + } + } + else { + getConvPoolOutParams(inpShape, local_kernel, strides, padMode, + std::vector(local_kernel.size(), 1), outShape); + } + + outputs.assign(1, outShape); + return false; + } + + bool updateMemoryShapes(const std::vector &inputs) CV_OVERRIDE + { + int dims = inputs[0].size(); + CV_Assert(inputs[0][dims - 1] > 0 && inputs[0][dims - 2] > 0); + shapesInitialized = true; + return true; + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(inputs); // suppress unused variable warning + long flops = 0; + bool isPool1D = inputs[0].size() == 3; + size_t karea = std::accumulate(kernel_size.begin(), isPool1D? kernel_size.begin() + 1 : kernel_size.end(), + 1, std::multiplies()); + for(int i = 0; i < outputs.size(); i++) + { + if (type == MAX) + { + if (i%2 == 0) + flops += total(outputs[i])*karea; + } + else + { + flops += total(outputs[i])*(karea + 1); + } + } + return flops; + } +private: + enum Type + { + MAX, + AVE, + STOCHASTIC, + SUM, + ROI, // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf + PSROI // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf + }; + bool hasDynamicShapes; + bool shapesInitialized; + float multiplier; +}; + +Ptr PoolingLayerInt8::create(const LayerParams& params) +{ + return Ptr(new PoolingLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp b/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp new file mode 100644 index 0000000000..2ddb76a0e8 --- /dev/null +++ b/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp @@ -0,0 +1,157 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
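// Reference sketch (not part of the patch, helper names are illustrative): the per-tensor
// affine mapping that QuantizeLayer / DequantizeLayer below implement with Mat::convertTo,
// where `scale` and `zeropoint` come from the "scales" / "zeropoints" layer params.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t quantizeValue(float x, float scale, int zeropoint)
{
    // q = round(x / scale) + zeropoint, saturated to the int8 range;
    // convertTo expresses this as alpha = 1/scale, beta = zeropoint.
    int q = (int)std::round(x / scale) + zeropoint;
    return (int8_t)std::min(std::max(q, -128), 127);
}

static inline float dequantizeValue(int8_t q, float scale, int zeropoint)
{
    // x = scale * (q - zeropoint);
    // convertTo expresses this as alpha = scale, beta = -scale*zeropoint.
    return scale * (float)((int)q - zeropoint);
}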
+ +#include "../precomp.hpp" +#include "layers_common.hpp" + +namespace cv +{ +namespace dnn +{ + +class QuantizeLayerImpl CV_FINAL : public QuantizeLayer +{ +public: + QuantizeLayerImpl(const LayerParams& params) + { + scale = params.get("scales", 1.0f); + zeropoint = params.get("zeropoints", 0); + setParamsFrom(params); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() == 1); + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return false; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + } + +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + std::vector inputs, outputs; + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + if (inputs_.depth() == CV_16S) + { + UMat inputFp32(shape(inputs[0]), CV_32F); + convertFp16(inputs[0], inputFp32); + inputFp32.copyTo(inputs[0]); + } + + inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint); + return true; + } +#endif + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint); + } +}; + +class DequantizeLayerImpl CV_FINAL : public DequantizeLayer +{ +public: + DequantizeLayerImpl(const LayerParams& params) + { + scale = params.get("scales", 1.0f); + zeropoint = params.get("zeropoints", 0); + setParamsFrom(params); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + CV_Assert(inputs.size() == 1); + Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + return false; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE + { + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + } + +#ifdef HAVE_OPENCL + bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) + { + std::vector inputs, outputs; + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + UMat outputFp32(shape(outputs[0]), CV_32F); + inputs[0].convertTo(outputFp32, CV_32F, scale, -(scale*zeropoint)); + + if (outputs_.depth() == CV_16S) + convertFp16(outputFp32, outputs[0]); + else + outputFp32.copyTo(outputs[0]); + return true; + } +#endif + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + + 
std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + inputs[0].convertTo(outputs[0], CV_32F, scale, -(scale*zeropoint)); + } +}; + +Ptr QuantizeLayer::create(const LayerParams& params) +{ + return Ptr(new QuantizeLayerImpl(params)); +} + +Ptr DequantizeLayer::create(const LayerParams& params) +{ + return Ptr(new DequantizeLayerImpl(params)); +} + +} +} diff --git a/modules/dnn/src/int8layers/scale_layer.cpp b/modules/dnn/src/int8layers/scale_layer.cpp new file mode 100644 index 0000000000..d7f676d047 --- /dev/null +++ b/modules/dnn/src/int8layers/scale_layer.cpp @@ -0,0 +1,211 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "layers_common.hpp" +#include +#include + +namespace cv +{ +namespace dnn +{ + +class ScaleLayerInt8Impl CV_FINAL : public ScaleLayerInt8 +{ +public: + Mat weights, bias; + ScaleLayerInt8Impl(const LayerParams& params) + { + setParamsFrom(params); + hasBias = params.get("bias_term", false); + axis = params.get("axis", 1); + hasWeights = false; + + output_sc = params.get("scales"); + output_zp = params.get("zeropoints"); + + DictValue inpSc = params.get("input_scales"); + DictValue inpZp = params.get("input_zeropoints"); + + for (int i = 0; i < inpSc.size(); i++) + { + inp_sc.push_back(inpSc.get(i)); + inp_zp.push_back(inpZp.get(i)); + } + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + outputs.assign(1, inputs[0]); + return true; + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE + { + std::vector inputs; + inputs_arr.getMatVector(inputs); + hasWeights = blobs.size() == 2 || (blobs.size() <= 1 && !hasBias); + CV_Assert((inputs.size() == 2 && blobs.empty()) || blobs.size() == (int)hasWeights + (int)hasBias); + + if (!blobs.empty()) + { + Mat w = hasWeights ? blobs[0] : Mat::ones(blobs[0].size(), CV_32F); + Mat b = hasBias ? blobs.back() : Mat::zeros(blobs.back().size(), CV_32F); + + w = w.reshape(1, 1); + b = b.reshape(1, 1); + + w.convertTo(weights, CV_32F, inp_sc[0]/output_sc); + addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F); + } + else + { + // initialized during forward() + weights = Mat(); bias = Mat(); + } + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + bool setActivation(const Ptr& layer) CV_OVERRIDE + { + Ptr activ_int8 = layer.dynamicCast(); + if (!activ_int8.empty()) + { + return activ_int8->blobs.empty(); + } + return false; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + + Mat &inpBlob = inputs[0]; + Mat &outBlob = outputs[0]; + + if (blobs.empty()) + { + CV_Assert(inp_sc.size() == 2 && inp_zp.size() == 2); + Mat inp_dequantized, w, b; + inputs[1].reshape(1, 1).convertTo(inp_dequantized, CV_32F, inp_sc[1], -(inp_sc[1]*inp_zp[1])); + w = hasWeights ? inp_dequantized : Mat::ones(inp_dequantized.size(), CV_32F); + b = hasBias ? 
inp_dequantized : Mat::zeros(inp_dequantized.size(), CV_32F); + + w.convertTo(weights, CV_32F, inp_sc[0]/output_sc); + addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F); + } + + MatShape inpShape = shape(inpBlob); + const int numWeights = weights.total(); + CV_Assert(numWeights != 0); + CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs"); + + int endAxis; + for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis) + { + if (total(inpShape, axis, endAxis) == numWeights) + break; + } + CV_Assert(total(inpShape, axis, endAxis) == numWeights); + CV_CheckTypeEQ(inpBlob.type(), CV_8SC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_8SC1, ""); + + int numSlices = total(inpShape, 0, axis); + int8_t* inpData = (int8_t*)inpBlob.data; + int8_t* outData = (int8_t*)outBlob.data; + + if (endAxis != inpBlob.dims) + { + float* weightsData = (float*)weights.data; + float* biasesData = (float*)bias.data; + int spatialSize = total(inpShape, endAxis); // spatialSize != 1 + for (int i = 0; i < numSlices; ++i) + { + for (int j = 0; j < numWeights; ++j) + { + float w = weightsData[j]; + float b = biasesData[j]; + Mat inpSlice(1, spatialSize, CV_8S, inpData); + Mat outSlice(1, spatialSize, CV_8S, outData); + inpSlice.convertTo(outSlice, CV_8S, w, b); + inpData += spatialSize; + outData += spatialSize; + } + } + } + else + { + for (int i = 0; i < numSlices; ++i) + { + Mat inpSlice(1, numWeights, CV_8S, inpData); + Mat outSlice(1, numWeights, CV_8S, outData); + + multiply(inpSlice, weights, outSlice, 1.0, CV_8S); + add(outSlice, bias, outSlice, Mat(), CV_8S); + + inpData += numWeights; + outData += numWeights; + } + } + } + + void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE + { + scale = (hasWeights && !blobs.empty()) ? blobs[0] : Mat(); + shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat(); + } + + void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE + { + scale = output_sc; + zeropoint = output_zp; + } + + virtual int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + long flops = 0; + for(int i = 0; i < inputs.size(); i++) + { + flops += 2*total(inputs[i]); + } + return flops; + } + +private: + bool hasWeights; + std::vector inp_sc; + std::vector inp_zp; +}; + + +Ptr ScaleLayerInt8::create(const LayerParams& params) +{ + return Ptr(new ScaleLayerInt8Impl(params)); +} + +Ptr ShiftLayerInt8::create(const LayerParams& params) +{ + LayerParams scaleParams = params; + scaleParams.type = "ScaleInt8"; + scaleParams.set("bias_term", true); + scaleParams.set("axis", 0); + return Ptr(new ScaleLayerInt8Impl(scaleParams)); +} + +} // namespace dnn +} // namespace cv diff --git a/modules/dnn/src/int8layers/softmax_layer.cpp b/modules/dnn/src/int8layers/softmax_layer.cpp new file mode 100644 index 0000000000..7e3c82bc21 --- /dev/null +++ b/modules/dnn/src/int8layers/softmax_layer.cpp @@ -0,0 +1,176 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
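// Reference sketch (not part of the patch, names are illustrative): the per-position
// computation performed by the int8 softmax forward pass below. `expLUT` stands for the
// 256-entry table of exponentials stored in blobs[0], indexed by (int8 value + 128);
// `cnStep` is the stride between consecutive channels along the softmax axis.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static inline int8_t softmaxAt(const int8_t* src, size_t channel, size_t channels, size_t cnStep,
                               const float* expLUT, float outputScale, int outputZp)
{
    float expSum = 0.f;
    for (size_t c = 0; c < channels; c++)
        expSum += expLUT[src[c * cnStep] + 128];              // sum of exp over the axis
    float p = expLUT[src[channel * cnStep] + 128] / expSum;   // softmax probability
    int q = outputZp + (int)std::round(p / outputScale);      // requantize to int8
    return (int8_t)std::min(std::max(q, -128), 127);
}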
+ +#include "../precomp.hpp" +#include "layers_common.hpp" + +#include +#include + +namespace cv +{ +namespace dnn +{ + +class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8 +{ +public: + + SoftMaxLayerInt8Impl(const LayerParams& params) + { + axisRaw = params.get("axis", 1); + logSoftMax = params.get("log_softmax", false); + output_sc = params.get("scales"); + output_zp = params.get("zeropoints"); + setParamsFrom(params); + } + + bool getMemoryShapes(const std::vector &inputs, + const int requiredOutputs, + std::vector &outputs, + std::vector &internals) const CV_OVERRIDE + { + bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); + MatShape shape = inputs[0]; + int cAxis = normalize_axis(axisRaw, shape.size()); + shape[cAxis] = 1; + internals.assign(1, shape); + return inplace; + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV; + } + + virtual bool tryFuse(Ptr& top) CV_OVERRIDE + { + Ptr dequantize_layer = top.dynamicCast(); + return !dequantize_layer.empty() && preferableTarget != DNN_TARGET_OPENCL_FP16; + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE + { + CV_TRACE_FUNCTION(); + CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + + std::vector inputs, outputs, internals; + inputs_arr.getMatVector(inputs); + outputs_arr.getMatVector(outputs); + internals_arr.getMatVector(internals); + + const Mat &src = inputs[0]; + Mat &dst = outputs[0]; + + int axis = normalize_axis(axisRaw, src.dims); + size_t outerSize = src.total(0, axis), channels = src.size[axis], + innerSize = src.total(axis + 1); + + CV_Assert(src.type() == CV_8S && (dst.type() == CV_8S || dst.type() == CV_32F)); + CV_Assert(src.isContinuous() && dst.isContinuous()); + + size_t outerStep = src.total(axis); + size_t cnStep = src.total(axis + 1); + const int8_t *srcPtr = src.ptr(); + const float *expPtr = blobs[0].ptr(); + + if (dst.type() == CV_32F) + { + float *dstPtr = dst.ptr(); + for (size_t outerDim = 0; outerDim < outerSize; outerDim++) + { + size_t srcOffset = outerDim * outerStep; + std::vector expSum(innerSize, 0.f); + + // sum exp along axis + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + expSum[i] += expPtr[srcPtr[offset + i] + 128]; + } + + // divide by computed sum + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + dstPtr[offset + i] = expPtr[srcPtr[offset + i] + 128]/expSum[i]; + } + + if (logSoftMax) + { + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + dstPtr[offset + i] = log(dstPtr[offset + i]); + } + } + } + } + else + { + const float inv_scale = 1.f/output_sc; + int8_t *dstPtr = dst.ptr(); + for (size_t outerDim = 0; outerDim < outerSize; outerDim++) + { + size_t srcOffset = outerDim * outerStep; + std::vector expSum(innerSize, 0.f); + + // sum exp along axis + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + expSum[i] += expPtr[srcPtr[offset + i] + 128]; + } + + // divide by computed sum and quantize to int8 + if (logSoftMax) + { + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i 
= 0; i < innerSize; i++) + dstPtr[offset + i] = saturate_cast(output_zp + std::round(inv_scale*log(expPtr[srcPtr[offset + i] + 128]/expSum[i]))); + } + } + else + { + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + const int offset = srcOffset + cnDim * cnStep; + for (size_t i = 0; i < innerSize; i++) + dstPtr[offset + i] = saturate_cast(output_zp + std::round(inv_scale*(expPtr[srcPtr[offset + i] + 128]/expSum[i]))); + } + } + } + } + } + + int64 getFLOPS(const std::vector &inputs, + const std::vector &outputs) const CV_OVERRIDE + { + CV_UNUSED(outputs); // suppress unused variable warning + int64 flops = 0; + + for (int i = 0; i < inputs.size(); i++) + { + flops += 4*total(inputs[i]); + } + + return flops; + } + + int axisRaw; +}; + +Ptr SoftmaxLayerInt8::create(const LayerParams& params) +{ + return Ptr(new SoftMaxLayerInt8Impl(params)); +} + +} +} diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index e28c964689..49804c5c13 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -409,6 +409,18 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + params.set("input_scale", scales[0][0]); + params.set("input_zeropoint", zeropoints[0][0]); + + params.blobs.clear(); + params.blobs.push_back(origin_weights); + params.blobs.push_back(origin_bias); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 5f93b45886..59548a9c0c 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -166,6 +166,11 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } }; Ptr BlankLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index a950c56167..536114fcd7 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -70,6 +70,7 @@ public: setParamsFrom(params); axis = params.get("axis", 1); padding = params.get("padding", false); + paddingValue = params.get("padding_value", 0); } virtual bool getMemoryShapes(const std::vector &inputs, @@ -119,13 +120,14 @@ public: (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding); } + template class ChannelConcatInvoker : public ParallelLoopBody { public: std::vector* inputs; Mat* output; int nstripes; - std::vector chptrs; + std::vector chptrs; static void run(std::vector& inputs, Mat& output, int nstripes) { @@ -139,14 +141,14 @@ public: for( i = 0; i < ninputs; i++ ) { Mat& inp = inputs[i]; - CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) && + CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) && inp.dims == 4 && inp.size[0] == output.size[0] && inp.size[2] == output.size[2] && inp.size[3] == output.size[3] ); nchannels += inp.size[1]; } CV_Assert( nchannels == output.size[1] ); - CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) ); + CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) ); 
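// Reference sketch (not part of the patch): with ChannelConcatInvoker now parameterized on
// the element type, the forward pass dispatches on the output depth. int8_t and float are
// assumed here as the template arguments behind the dispatch shown further below:
//   if (outMat.type() == CV_8S)
//       ChannelConcatInvoker<int8_t>::run(inputs, outMat, nstripes);
//   else
//       ChannelConcatInvoker<float>::run(inputs, outMat, nstripes);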
cc.chptrs.resize(nchannels*batchsz); @@ -157,7 +159,7 @@ public: for( int j = 0; j < batchsz; j++ ) for( int k = 0; k < inp.size[1]; k++ ) { - const float* ptr = inp.ptr(j, k); + const T* ptr = inp.ptr(j, k); cc.chptrs[ofs + j*nchannels + k] = ptr; } ofs += inp.size[1]; @@ -176,8 +178,8 @@ public: size_t stripeSize = (total + nstripes - 1)/nstripes; size_t stripeStart = r.start*stripeSize; size_t stripeEnd = std::min(total, r.end*stripeSize); - const float** ptrs = (const float**)&chptrs[0]; - float* outptr = output->ptr(); + const T** ptrs = (const T**)&chptrs[0]; + T* outptr = output->ptr(); size_t blockSize0 = 1 << 16; for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; ) @@ -248,7 +250,8 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && + inputs_arr.depth() != CV_8S, forward_ocl(inputs_arr, outputs_arr, internals_arr)) std::vector inputs, outputs; @@ -259,12 +262,15 @@ public: Mat& outMat = outputs[0]; if (padding) - outMat.setTo(0); + outMat.setTo(paddingValue); if( cAxis == 1 && outMat.dims == 4 && !padding) { int nstripes = getNumThreads(); - ChannelConcatInvoker::run(inputs, outMat, nstripes); + if (outMat.type() == CV_8S) + ChannelConcatInvoker::run(inputs, outMat, nstripes); + else + ChannelConcatInvoker::run(inputs, outMat, nstripes); } else { @@ -394,6 +400,14 @@ public: return Ptr(new InfEngineNgraphNode(concat)); } #endif // HAVE_DNN_NGRAPH + + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (padding) + params.set("padding_value", zeropoints[1][0]); + return true; + } }; Ptr ConcatLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp index bbea3e3f2c..18f190b36b 100644 --- a/modules/dnn/src/layers/const_layer.cpp +++ b/modules/dnn/src/layers/const_layer.cpp @@ -112,6 +112,15 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + Mat quantizedBlob; + blobs[0].convertTo(quantizedBlob, CV_8S, 1.f/scales[1][0], zeropoints[1][0]); + params.blobs.clear(); + params.blobs.push_back(quantizedBlob); + return true; + } }; Ptr ConstLayer::create(const LayerParams& params) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 68c543be24..ec2904ee69 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -2083,6 +2083,48 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + // References - https://arxiv.org/pdf/1712.05877.pdf + + // Quantized convolution with variable weights is not supported. + if (blobs.empty()) + return false; + + float inputScale = scales[0][0], outputScale = scales[1][0]; + int inputZp = zeropoints[0][0]; + params.set("input_zeropoint", inputZp); + + Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S); + Mat biasQuantized(1, numOutput, CV_32S); + Mat outputMultiplier(1, numOutput, CV_32F); + double realMin, realMax, weightsScale; + + for( int i = 0; i < numOutput; i++ ) + { + // Quantize weights + cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax); + realMin = std::min(realMin, 0.0); + realMax = std::max(realMax, 0.0); + weightsScale = (realMax == realMin) ? 
1.0 : std::max(-realMin, realMax)/127; + weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale); + + // Quantize biases + float biasScale = inputScale * weightsScale; + biasQuantized.at(i) = (int)std::round(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]); + + // Store multiplier + outputMultiplier.at(i) = biasScale / outputScale; + } + + params.blobs.clear(); + params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0]))); + params.blobs.push_back(biasQuantized); + params.blobs.push_back(outputMultiplier); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 9bb5be342f..6dc1813c8b 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -255,6 +255,12 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return func.tryQuantize(scales, zeropoints, params); + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { @@ -288,6 +294,8 @@ struct BaseFunctor bool tryFuse(Ptr&) { return false; } void getScaleShift(Mat&, Mat&) const {} + + bool tryQuantize(const std::vector>&, const std::vector>&, LayerParams&) { return false; } }; struct ReLUFunctor : public BaseFunctor @@ -436,6 +444,29 @@ struct ReLUFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + if (slope != 0.f) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x >= 0.f ? 
x : slope*x; + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + } + return true; + } + int64 getFLOPSPerElement() const { return 1; } }; @@ -559,6 +590,12 @@ struct ReLU6Functor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + return true; + } + int64 getFLOPSPerElement() const { return 2; } }; @@ -651,6 +688,26 @@ struct TanHFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = tanh(x); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 1; } }; @@ -743,6 +800,26 @@ struct SwishFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x / (1.0f + exp(-x)); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 3; } }; @@ -848,6 +925,28 @@ struct MishFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float eX = exp(x); + float n = (eX + 2) * eX; + float y = (x * n) / (n + 2); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 3; } }; @@ -940,6 +1039,26 @@ struct SigmoidFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = 1.f/(1.f + exp(-x)); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 3; } }; @@ -1032,6 +1151,26 @@ struct ELUFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool 
tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x >= 0.f ? x : exp(x) - 1; + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 2; } }; @@ -1130,6 +1269,26 @@ struct AbsValFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = abs(x); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 1; } }; @@ -1223,6 +1382,26 @@ struct BNLLFunctor : public BaseFunctor } #endif // HAVE_VULKAN + bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) + { + float inpScale = scales[0][0], outScale = scales[1][0]; + int inpZp = zeropoints[0][0], outZp = zeropoints[1][0]; + + Mat lookUpTable(1, 256, CV_8S); + int8_t* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - inpZp); + float y = x > 0 ? x + log(1. + exp(-x)) : log(1. + exp(x)); + int quantized = outZp + (int)std::round(y/outScale); + table[i+128] = saturate_cast(quantized); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPSPerElement() const { return 5; } }; diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index a337c48d9e..860560213d 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -864,6 +864,37 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (op == SUM) + { + std::vector newCoeffs; + float offset = zeropoints[1][0]; + float out_sc = scales[1][0]; + for (int i = 0; i < scales[0].size(); i++) + { + float coeff = coeffs.empty() ? 
1.f : coeffs[i]; + float newcoeff = (scales[0][i] * coeff) / out_sc; + newCoeffs.push_back(newcoeff); + offset -= (newcoeff * zeropoints[0][i]); + } + params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size())); + params.set("offset", offset); + return true; + } + else if (op == PROD) + { + std::vector newCoeffs = scales[0]; + newCoeffs[0] /= scales[1][0]; + params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size())); + params.set("offset", zeropoints[1][0]); + params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size())); + return true; + } + return op == MAX; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index 7cf01a14fa..8ff862fab0 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -227,6 +227,11 @@ virtual Ptr initNgraph(const std::vector >& inp } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } int _startAxis; int _endAxis; diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 529f3c04fd..28ea7f347f 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -618,6 +618,45 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (blobs.empty()) + return false; + + int numOutput = blobs[0].size[0]; + float inputScale = scales[0][0], outputScale = scales[1][0]; + int inputZp = zeropoints[0][0]; + + Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S); + Mat biasQuantized(1, numOutput, CV_32S); + Mat outputMultiplier(1, numOutput, CV_32F); + + double realMin, realMax, weightsScale; + for( int i = 0; i < numOutput; i++ ) + { + // Quantize weights + cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax); + realMin = std::min(realMin, 0.0); + realMax = std::max(realMax, 0.0); + weightsScale = (realMax == realMin) ? 
1.0 : std::max(-realMin, realMax)/127; + weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale); + + // Quantize biases + float biasScale = inputScale * weightsScale; + biasQuantized.at(i) = (int)std::round(biasMat.at(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]); + + // Store multiplier + outputMultiplier.at(i) = biasScale / outputScale; + } + + params.blobs.clear(); + params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0]))); + params.blobs.push_back(biasQuantized); + params.blobs.push_back(outputMultiplier); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp index d182568795..c1979ce701 100644 --- a/modules/dnn/src/layers/padding_layer.cpp +++ b/modules/dnn/src/layers/padding_layer.cpp @@ -134,6 +134,8 @@ public: cv::convertFp16(paddingValue_fp32, paddingValue_fp16); outputs[0].setTo(paddingValue_fp16[0]); } + else if (inputs_arr.depth() == CV_8S) + outputs[0].setTo(saturate_cast(paddingValue)); else outputs[0].setTo(paddingValue); inputs[0].copyTo(outputs[0](dstRanges)); @@ -264,6 +266,16 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + float outputScale = scales[1][0]; + int outputZp = zeropoints[1][0]; + float padValue = outputZp + std::round(params.get("value", 0)/outputScale); + params.set("value", padValue); + return true; + } + private: std::vector > paddings; // Pairs pad before, pad after. std::vector dstRanges; diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index c525c3f82f..77c2469c05 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -194,6 +194,7 @@ public: #endif } + template class PermuteInvoker : public ParallelLoopBody { public: @@ -229,7 +230,7 @@ public: size_t stripeStart = r.start*stripeSize; size_t stripeEnd = std::min(r.end*stripeSize, orows); - const size_t esz = sizeof(float); + const size_t esz = sizeof(T); size_t ostep0 = out->step[0]/esz, ostep1 = out->step[1]/esz, ostep2 = out->step[2]/esz; const size_t* ord = &order->at(0); size_t istep0 = inp->step[ord[0]]/esz, istep1 = inp->step[ord[1]]/esz, @@ -241,13 +242,13 @@ public: int i1 = (int)(val % n1); int i0 = (int)(val / n1); - const float* inptr_orig = inp->ptr(); - float* outptr_orig = out->ptr(); + const T* inptr_orig = inp->ptr(); + T* outptr_orig = out->ptr(); for( size_t ofs = stripeStart; ofs < stripeEnd; ofs++ ) { - const float* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2; - float* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2; + const T* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2; + T* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2; for( int i3 = 0; i3 < n3; i3++ ) outptr[i3] = inptr[i3*istep3]; @@ -321,7 +322,8 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && + inputs_arr.depth() != CV_8S, forward_ocl(inputs_arr, outputs_arr, internals_arr)) if (inputs_arr.depth() == CV_16S) @@ -365,24 +367,48 @@ public: if( numAxes == 4 ) { int nstripes = getNumThreads(); - PermuteInvoker::run(inp, out, _order, nstripes); + if (inp.type() == CV_8S) + PermuteInvoker::run(inp, out, _order, nstripes); + else + 
PermuteInvoker::run(inp, out, _order, nstripes); } else { - const float *srcData = inp.ptr(); - float *dstData = out.ptr(); + if (inp.type() == CV_8S) + { + const int8_t *srcData = inp.ptr(); + int8_t *dstData = out.ptr(); - for (i = 0; i < count; ++i) + for (i = 0; i < count; ++i) + { + size_t oldPosition = 0; + size_t newPosition = i; + + for (j = 0; j < numAxes; ++j) + { + oldPosition += (newPosition / newStride[j]) * oldStride[order[j]]; + newPosition %= newStride[j]; + } + dstData[i] = srcData[oldPosition]; + } + } + else { - size_t oldPosition = 0; - size_t newPosition = i; + const float *srcData = inp.ptr(); + float *dstData = out.ptr(); - for (j = 0; j < numAxes; ++j) + for (i = 0; i < count; ++i) { - oldPosition += (newPosition / newStride[j]) * oldStride[order[j]]; - newPosition %= newStride[j]; + size_t oldPosition = 0; + size_t newPosition = i; + + for (j = 0; j < numAxes; ++j) + { + oldPosition += (newPosition / newStride[j]) * oldStride[order[j]]; + newPosition %= newStride[j]; + } + dstData[i] = srcData[oldPosition]; } - dstData[i] = srcData[oldPosition]; } } } @@ -436,6 +462,11 @@ public: } #endif // HAVE_VULKAN + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } size_t _count; std::vector _order; diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index b8e2cfdf8f..7653e53668 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -1327,6 +1327,23 @@ public: return true; } + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + if (type == MAX && !computeMaxIdx) + { + return true; + } + else if (type == AVE || type == SUM) + { + float multiplier = scales[0][0] / scales[1][0]; + params.set("multiplier", multiplier); + params.set("input_zeropoint", zeropoints[0][0]); + return true; + } + return false; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index da1c61adac..797df4819d 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -231,6 +231,11 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index ab8f41c7b6..4c10d155c8 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -343,6 +343,11 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } private: std::vector outShapes; diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index a5c268214e..001db24a2d 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -344,6 +344,14 @@ public: shift = (hasBias && !blobs.empty()) ? 
blobs.back() : Mat(); } + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size())); + params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size())); + return true; + } + virtual int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/shuffle_channel_layer.cpp b/modules/dnn/src/layers/shuffle_channel_layer.cpp index 6db74d1abd..2a698d270f 100644 --- a/modules/dnn/src/layers/shuffle_channel_layer.cpp +++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp @@ -147,6 +147,12 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + return true; + } + private: Ptr permute; std::vector permuteInpShape, permuteOutShape; diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 54e2340387..9efd95cf48 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -531,7 +531,12 @@ public: { std::vector inpIdx(dimsNum, 0); std::vector outIdx(dimsNum, 0); - getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + if (inpMat.type() == CV_16S) + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + else if (inpMat.type() == CV_8S) + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + else + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); } } } @@ -647,8 +652,20 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + const int numOutputs = scales[1].size(); + for (int i = 0; i < numOutputs; i++) + { + if (scales[1][i] != scales[0][0]) + return false; + } + return true; + } private: + template void getSliceRecursive(const Mat &inpMat, std::vector &inpIdx, const std::vector &sliceRanges, const std::vector &sliceSteps, int dim, int dimsNum, @@ -658,8 +675,6 @@ private: int end = sliceRanges[dim].end; int step = !sliceSteps.empty() ? 
sliceSteps[dim] : 1; - const bool is32F = inpMat.depth() == CV_32F; - // TODO optimization is required (for 2D tail case at least) for (int k = begin, j = 0; k < end; k += step, j++) { @@ -667,14 +682,9 @@ private: outIdx[dim] = j; if (dim + 1 < dimsNum) - getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx); + getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx); else - { - if (is32F) - outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); - else - outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); // 16F emulation - } + outputs.at(outIdx.data()) = inpMat.at(inpIdx.data()); } } diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 546c1017ad..e937e98f8c 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -374,6 +374,22 @@ public: } #endif // HAVE_DNN_NGRAPH + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + float inpScale = scales[0][0]; + Mat lookUpTable(1, 256, CV_32F); + float* table = lookUpTable.ptr(); + for (int i = -128; i < 128; i++) + { + float x = inpScale*(i - 127); // ensures exp(x) is always between (0, 1) + table[i+128] = std::exp(x); + } + params.blobs.clear(); + params.blobs.push_back(lookUpTable); + return true; + } + int64 getFLOPS(const std::vector &inputs, const std::vector &outputs) const CV_OVERRIDE { diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp index b025d5ff1e..2a44176152 100644 --- a/modules/dnn/src/layers/split_layer.cpp +++ b/modules/dnn/src/layers/split_layer.cpp @@ -117,6 +117,17 @@ public: } #endif + virtual bool tryQuantize(const std::vector > &scales, + const std::vector > &zeropoints, LayerParams& params) CV_OVERRIDE + { + const int numOutputs = scales[1].size(); + for (int i = 0; i < numOutputs; i++) + { + if (scales[1][i] != scales[0][0]) + return false; + } + return true; + } }; Ptr SplitLayer::create(const LayerParams& params) diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp new file mode 100644 index 0000000000..1fcb1d0dba --- /dev/null +++ b/modules/dnn/test/test_int8_layers.cpp @@ -0,0 +1,1220 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" +#include "npy_blob.hpp" +#include +#include +namespace opencv_test { namespace { + +template +static std::string _tf(TString filename) +{ + return (getOpenCVExtraDir() + "dnn/") + filename; +} + +class Test_Int8_layers : public DNNTestLayer +{ +public: + void testLayer(const String& basename, const String& importer, double l1, double lInf, + int numInps = 1, int numOuts = 1, bool useCaffeModel = false, + bool useCommonInputBlob = true, bool hasText = false) + { + CV_Assert_N(numInps >= 1, numInps <= 10, numOuts >= 1, numOuts <= 10); + std::vector inps(numInps), inps_int8(numInps); + std::vector refs(numOuts), outs_int8(numOuts), outs_dequantized(numOuts); + std::vector inputScale, outputScale; + std::vector inputZp, outputZp; + String inpPath, outPath; + Net net, qnet; + + if (importer == "Caffe") + { + String prototxt = _tf("layers/" + basename + ".prototxt"); + String caffemodel = _tf("layers/" + basename + ".caffemodel"); + net = readNetFromCaffe(prototxt, useCaffeModel ? 
caffemodel : String()); + + inpPath = _tf("layers/" + (useCommonInputBlob ? "blob" : basename + ".input")); + outPath = _tf("layers/" + basename); + } + else if (importer == "TensorFlow") + { + String netPath = _tf("tensorflow/" + basename + "_net.pb"); + String netConfig = hasText ? _tf("tensorflow/" + basename + "_net.pbtxt") : ""; + net = readNetFromTensorflow(netPath, netConfig); + + inpPath = _tf("tensorflow/" + basename + "_in"); + outPath = _tf("tensorflow/" + basename + "_out"); + } + else if (importer == "ONNX") + { + String onnxmodel = _tf("onnx/models/" + basename + ".onnx"); + net = readNetFromONNX(onnxmodel); + + inpPath = _tf("onnx/data/input_" + basename); + outPath = _tf("onnx/data/output_" + basename); + } + ASSERT_FALSE(net.empty()); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + + for (int i = 0; i < numInps; i++) + inps[i] = blobFromNPY(inpPath + ((numInps > 1) ? cv::format("_%d.npy", i) : ".npy")); + + for (int i = 0; i < numOuts; i++) + refs[i] = blobFromNPY(outPath + ((numOuts > 1) ? cv::format("_%d.npy", i) : ".npy")); + + qnet = net.quantize(inps, CV_8S, CV_8S); + qnet.getInputDetails(inputScale, inputZp); + qnet.getOutputDetails(outputScale, outputZp); + + // Quantize inputs to int8 + // int8_value = float_value/scale + zero-point + for (int i = 0; i < numInps; i++) + { + inps[i].convertTo(inps_int8[i], CV_8S, 1.f/inputScale[i], inputZp[i]); + String inp_name = numInps > 1 ? (importer == "Caffe" ? cv::format("input_%d", i) : cv::format("%d", i)) : ""; + qnet.setInput(inps_int8[i], inp_name); + } + qnet.forward(outs_int8); + + // Dequantize outputs and compare with reference outputs + // float_value = scale*(int8_value - zero-point) + for (int i = 0; i < numOuts; i++) + { + outs_int8[i].convertTo(outs_dequantized[i], CV_32F, outputScale[i], -(outputScale[i] * outputZp[i])); + normAssert(refs[i], outs_dequantized[i], "", l1, lInf); + } + } +}; + +TEST_P(Test_Int8_layers, Convolution1D) +{ + testLayer("conv1d", "ONNX", 0.00302, 0.00909); + testLayer("conv1d_bias", "ONNX", 0.00306, 0.00948); +} + +TEST_P(Test_Int8_layers, Convolution2D) +{ + testLayer("layer_convolution", "Caffe", 0.0174, 0.0758, 1, 1, true); + testLayer("single_conv", "TensorFlow", 0.00413, 0.02201); + testLayer("depthwise_conv2d", "TensorFlow", 0.0388, 0.169); + testLayer("atrous_conv2d_valid", "TensorFlow", 0.0193, 0.0633); + testLayer("atrous_conv2d_same", "TensorFlow", 0.0185, 0.1322); + testLayer("keras_atrous_conv2d_same", "TensorFlow", 0.0056, 0.0244); + testLayer("convolution", "ONNX", 0.0052, 0.01516); + testLayer("two_convolution", "ONNX", 0.00295, 0.00840); +} + +TEST_P(Test_Int8_layers, Convolution3D) +{ + testLayer("conv3d", "TensorFlow", 0.00734, 0.02434); + testLayer("conv3d", "ONNX", 0.00353, 0.00941); + testLayer("conv3d_bias", "ONNX", 0.00129, 0.00249); +} + +TEST_P(Test_Int8_layers, Flatten) +{ + testLayer("flatten", "TensorFlow", 0.0036, 0.0069, 1, 1, false, true, true); + testLayer("unfused_flatten", "TensorFlow", 0.0014, 0.0028); + testLayer("unfused_flatten_unknown_batch", "TensorFlow", 0.0043, 0.0051); +} + +TEST_P(Test_Int8_layers, Padding) +{ + testLayer("padding_valid", "TensorFlow", 0.0026, 0.0064); + testLayer("padding_same", "TensorFlow", 0.0081, 0.032); + testLayer("spatial_padding", "TensorFlow", 0.0078, 0.028); + testLayer("mirror_pad", "TensorFlow", 0.0064, 0.013); + testLayer("pad_and_concat", "TensorFlow", 0.0021, 0.0098); + testLayer("padding", "ONNX", 0.0005, 0.0069); + testLayer("ReflectionPad2d", "ONNX", 0.00062, 0.0018); + 
testLayer("ZeroPad2d", "ONNX", 0.00037, 0.0018); +} + +TEST_P(Test_Int8_layers, AvePooling) +{ + testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075); + testLayer("ave_pool_same", "TensorFlow", 0.00153, 0.0041); + testLayer("average_pooling_1d", "ONNX", 0.002, 0.0048); + testLayer("average_pooling", "ONNX", 0.0014, 0.0032); + testLayer("average_pooling_dynamic_axes", "ONNX", 0.0014, 0.006); + + if (target != DNN_TARGET_CPU) + throw SkipTestException("Only CPU is supported"); + testLayer("ave_pool3d", "TensorFlow", 0.00175, 0.0047); + testLayer("ave_pool3d", "ONNX", 0.00063, 0.0016); +} + +TEST_P(Test_Int8_layers, MaxPooling) +{ + testLayer("pool_conv_1d", "ONNX", 0.0006, 0.0015); + if (target != DNN_TARGET_CPU) + throw SkipTestException("Only CPU is supported"); + testLayer("pool_conv_3d", "ONNX", 0.0033, 0.0124); + + /* All the below tests have MaxPooling as last layer, so computeMaxIdx is set to true + which is not supported by int8 maxpooling + testLayer("layer_pooling_max", "Caffe", 0.0021, 0.004); + testLayer("max_pool_even", "TensorFlow", 0.0048, 0.0139); + testLayer("max_pool_odd_valid", "TensorFlow", 0.0043, 0.012); + testLayer("conv_pool_nchw", "TensorFlow", 0.007, 0.025); + testLayer("max_pool3d", "TensorFlow", 0.0025, 0.0058); + testLayer("maxpooling_1d", "ONNX", 0.0018, 0.0037); + testLayer("two_maxpooling_1d", "ONNX", 0.0037, 0.0052); + testLayer("maxpooling", "ONNX", 0.0034, 0.0065); + testLayer("two_maxpooling", "ONNX", 0.0025, 0.0052); + testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);*/ +} + +TEST_P(Test_Int8_layers, Reduce) +{ + testLayer("reduce_mean", "TensorFlow", 0.0005, 0.0014); + testLayer("reduce_mean", "ONNX", 0.00062, 0.0014); + testLayer("reduce_mean_axis1", "ONNX", 0.00032, 0.0007); + testLayer("reduce_mean_axis2", "ONNX", 0.00033, 0.001); + + testLayer("reduce_sum", "TensorFlow", 0.015, 0.031); + testLayer("reduce_sum_channel", "TensorFlow", 0.008, 0.019); + testLayer("sum_pool_by_axis", "TensorFlow", 0.012, 0.032); + testLayer("reduce_sum", "ONNX", 0.0025, 0.0048); + + testLayer("reduce_max", "ONNX", 0, 0); + testLayer("reduce_max_axis_0", "ONNX", 0.0042, 0.007); + testLayer("reduce_max_axis_1", "ONNX", 0.0018, 0.0036); + + if (target != DNN_TARGET_CPU) + throw SkipTestException("Only CPU is supported"); + testLayer("reduce_mean3d", "ONNX", 0.00048, 0.0016); +} + +TEST_P(Test_Int8_layers, ReLU) +{ + testLayer("layer_relu", "Caffe", 0.0005, 0.002); + testLayer("ReLU", "ONNX", 0.0012, 0.0047); +} + +TEST_P(Test_Int8_layers, LeakyReLU) +{ + testLayer("leaky_relu", "TensorFlow", 0.0002, 0.0004); +} + +TEST_P(Test_Int8_layers, ReLU6) +{ + testLayer("keras_relu6", "TensorFlow", 0.0018, 0.0062); + testLayer("keras_relu6", "TensorFlow", 0.0018, 0.0062, 1, 1, false, true, true); + testLayer("clip_by_value", "TensorFlow", 0.0009, 0.002); + testLayer("clip", "ONNX", 0.00006, 0.00037); +} + +TEST_P(Test_Int8_layers, Sigmoid) +{ + testLayer("maxpooling_sigmoid", "ONNX", 0.0011, 0.0032); + testLayer("maxpooling_sigmoid_dynamic_axes", "ONNX", 0.0011, 0.0032); + testLayer("maxpooling_sigmoid_1d", "ONNX", 0.0011, 0.0037); +} + +TEST_P(Test_Int8_layers, Mish) +{ + testLayer("mish", "ONNX", 0.0015, 0.0025); +} + +TEST_P(Test_Int8_layers, Softmax) +{ + testLayer("layer_softmax", "Caffe", 0.0011, 0.0036); + testLayer("keras_softmax", "TensorFlow", 0.00093, 0.0027); + testLayer("slim_softmax", "TensorFlow", 0.0016, 0.0034); + testLayer("slim_softmax_v2", "TensorFlow", 0.0029, 0.017); + testLayer("softmax", "ONNX", 0.0016, 0.0028); + testLayer("log_softmax", "ONNX", 0.014, 
0.025); + testLayer("softmax_unfused", "ONNX", 0.0009, 0.0021); +} + +TEST_P(Test_Int8_layers, Concat) +{ + testLayer("layer_concat_shared_input", "Caffe", 0.0076, 0.029, 1, 1, true, false); + testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017); + testLayer("keras_pad_concat", "TensorFlow", 0.0032, 0.0089); + testLayer("concat_3d", "TensorFlow", 0.005, 0.014); + testLayer("concatenation", "ONNX", 0.0032, 0.009); +} + +TEST_P(Test_Int8_layers, BatchNorm) +{ + testLayer("layer_batch_norm", "Caffe", 0.0061, 0.019, 1, 1, true); + testLayer("fused_batch_norm", "TensorFlow", 0.0063, 0.02); + testLayer("batch_norm_text", "TensorFlow", 0.0048, 0.013, 1, 1, false, true, true); + testLayer("unfused_batch_norm", "TensorFlow", 0.0076, 0.019); + testLayer("fused_batch_norm_no_gamma", "TensorFlow", 0.0067, 0.015); + testLayer("unfused_batch_norm_no_gamma", "TensorFlow", 0.0123, 0.044); + testLayer("switch_identity", "TensorFlow", 0.0035, 0.011); + testLayer("batch_norm3d", "TensorFlow", 0.0077, 0.02); + testLayer("batch_norm", "ONNX", 0.0012, 0.0049); + testLayer("batch_norm_3d", "ONNX", 0.0039, 0.012); + testLayer("frozenBatchNorm2d", "ONNX", 0.001, 0.0018); + testLayer("batch_norm_subgraph", "ONNX", 0.0049, 0.0098); +} + +TEST_P(Test_Int8_layers, Scale) +{ + testLayer("batch_norm", "TensorFlow", 0.0028, 0.0098); + testLayer("scale", "ONNX", 0.0025, 0.0071); + testLayer("expand_hw", "ONNX", 0.0012, 0.0012); + testLayer("flatten_const", "ONNX", 0.0024, 0.0048); +} + +TEST_P(Test_Int8_layers, InnerProduct) +{ + testLayer("layer_inner_product", "Caffe", 0.005, 0.02, 1, 1, true); + testLayer("matmul", "TensorFlow", 0.0061, 0.019); + testLayer("nhwc_transpose_reshape_matmul", "TensorFlow", 0.0009, 0.0091); + testLayer("nhwc_reshape_matmul", "TensorFlow", 0.03, 0.071); + testLayer("matmul_layout", "TensorFlow", 0.035, 0.06); + testLayer("tf2_dense", "TensorFlow", 0, 0); + testLayer("matmul_add", "ONNX", 0.041, 0.082); + testLayer("linear", "ONNX", 0.0018, 0.0029); + testLayer("constant", "ONNX", 0.00021, 0.0006); + testLayer("lin_with_constant", "ONNX", 0.0011, 0.0016); +} + +TEST_P(Test_Int8_layers, Reshape) +{ + testLayer("reshape_layer", "TensorFlow", 0.0032, 0.0082); + testLayer("reshape_nchw", "TensorFlow", 0.0089, 0.029); + testLayer("reshape_conv", "TensorFlow", 0.035, 0.054); + testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078); + testLayer("reshape_as_shape", "TensorFlow", 0.0014, 0.0028); + testLayer("reshape_no_reorder", "TensorFlow", 0.0014, 0.0028); + testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, 0.014); + testLayer("dynamic_reshape", "ONNX", 0.0047, 0.0079); + testLayer("dynamic_reshape_opset_11", "ONNX", 0.0048, 0.0081); + testLayer("flatten_by_prod", "ONNX", 0.0048, 0.0081); + testLayer("squeeze", "ONNX", 0.0048, 0.0081); + testLayer("unsqueeze", "ONNX", 0.0033, 0.0053); + testLayer("squeeze_and_conv_dynamic_axes", "ONNX", 0.0054, 0.0154); + testLayer("unsqueeze_and_conv_dynamic_axes", "ONNX", 0.0037, 0.0151); +} + +TEST_P(Test_Int8_layers, Permute) +{ + testLayer("tf2_permute_nhwc_ncwh", "TensorFlow", 0.0028, 0.006); + testLayer("transpose", "ONNX", 0.0015, 0.0046); +} + +TEST_P(Test_Int8_layers, Identity) +{ + testLayer("expand_batch", "ONNX", 0.0027, 0.0036); + testLayer("expand_channels", "ONNX", 0.0013, 0.0019); + testLayer("expand_neg_batch", "ONNX", 0.00071, 0.0019); +} + +TEST_P(Test_Int8_layers, Slice) +{ + testLayer("split", "TensorFlow", 0.0033, 0.0056); + testLayer("slice_4d", "TensorFlow", 0.003, 0.0073); + testLayer("strided_slice", "TensorFlow", 0.008, 
0.0142); + testLayer("slice", "ONNX", 0.0046, 0.0077); + testLayer("slice_dynamic_axes", "ONNX", 0.0039, 0.0084); + testLayer("slice_opset_11_steps_2d", "ONNX", 0.0052, 0.0124); + testLayer("slice_opset_11_steps_3d", "ONNX", 0.0068, 0.014); + testLayer("slice_opset_11_steps_4d", "ONNX", 0.0041, 0.008); + testLayer("slice_opset_11_steps_5d", "ONNX", 0.0085, 0.021); +} + +TEST_P(Test_Int8_layers, Dropout) +{ + testLayer("layer_dropout", "Caffe", 0.0021, 0.004); + testLayer("dropout", "ONNX", 0.0029, 0.004); +} + +TEST_P(Test_Int8_layers, Eltwise) +{ + testLayer("layer_eltwise", "Caffe", 0.062, 0.15); + testLayer("conv_2_inps", "Caffe", 0.0086, 0.0232, 2, 1, true, false); + testLayer("eltwise_sub", "TensorFlow", 0.015, 0.047); + testLayer("eltwise_add_vec", "TensorFlow", 0.037, 0.21); // tflite 0.0095, 0.0365 + testLayer("eltwise_mul_vec", "TensorFlow", 0.173, 1.14); // tflite 0.0028, 0.017 + testLayer("channel_broadcast", "TensorFlow", 0.0025, 0.0063); + testLayer("split_equals", "TensorFlow", 0.02, 0.065); + testLayer("mul", "ONNX", 0.0039, 0.014); + testLayer("split_max", "ONNX", 0.004, 0.012); +} + +INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_layers, dnnBackendsAndTargets()); + +class Test_Int8_nets : public DNNTestLayer +{ +public: + void testClassificationNet(Net baseNet, const Mat& blob, const Mat& ref, double l1, double lInf) + { + Net qnet = baseNet.quantize(blob, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + + qnet.setInput(blob); + Mat out = qnet.forward(); + normAssert(ref, out, "", l1, lInf); + } + + void testDetectionNet(Net baseNet, const Mat& blob, const Mat& ref, + double confThreshold, double scoreDiff, double iouDiff) + { + Net qnet = baseNet.quantize(blob, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + + qnet.setInput(blob); + Mat out = qnet.forward(); + normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff); + } + + void testFaster(Net baseNet, const Mat& ref, double confThreshold, double scoreDiff, double iouDiff) + { + Mat inp = imread(_tf("dog416.png")); + resize(inp, inp, Size(800, 600)); + Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false); + Mat imInfo = (Mat_(1, 3) << inp.rows, inp.cols, 1.6f); + + Net qnet = baseNet.quantize(std::vector{blob, imInfo}, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + + qnet.setInput(blob, "data"); + qnet.setInput(imInfo, "im_info"); + Mat out = qnet.forward(); + normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff); + } + + void testONNXNet(const String& basename, double l1, double lInf, bool useSoftmax = false) + { + String onnxmodel = findDataFile("dnn/onnx/models/" + basename + ".onnx", false); + + Mat blob = readTensorFromONNX(findDataFile("dnn/onnx/data/input_" + basename + ".pb")); + Mat ref = readTensorFromONNX(findDataFile("dnn/onnx/data/output_" + basename + ".pb")); + Net baseNet = readNetFromONNX(onnxmodel); + baseNet.setPreferableBackend(backend); + baseNet.setPreferableTarget(target); + + Net qnet = baseNet.quantize(blob, CV_32F, CV_32F); + qnet.setInput(blob); + Mat out = qnet.forward(); + + if (useSoftmax) + { + LayerParams lp; + Net netSoftmax; + netSoftmax.addLayerToPrev("softmaxLayer", "Softmax", lp); + netSoftmax.setPreferableBackend(DNN_BACKEND_OPENCV); + + netSoftmax.setInput(out); + out = netSoftmax.forward(); + + netSoftmax.setInput(ref); + ref = netSoftmax.forward(); + } + + normAssert(ref, out, "", l1, 
lInf); + } + + void testDarknetModel(const std::string& cfg, const std::string& weights, + const cv::Mat& ref, double scoreDiff, double iouDiff, + float confThreshold = 0.24, float nmsThreshold = 0.4) + { + CV_Assert(ref.cols == 7); + std::vector > refClassIds; + std::vector > refScores; + std::vector > refBoxes; + for (int i = 0; i < ref.rows; ++i) + { + int batchId = static_cast(ref.at(i, 0)); + int classId = static_cast(ref.at(i, 1)); + float score = ref.at(i, 2); + float left = ref.at(i, 3); + float top = ref.at(i, 4); + float right = ref.at(i, 5); + float bottom = ref.at(i, 6); + Rect2d box(left, top, right - left, bottom - top); + if (batchId >= refClassIds.size()) + { + refClassIds.resize(batchId + 1); + refScores.resize(batchId + 1); + refBoxes.resize(batchId + 1); + } + refClassIds[batchId].push_back(classId); + refScores[batchId].push_back(score); + refBoxes[batchId].push_back(box); + } + + Mat img1 = imread(_tf("dog416.png")); + Mat img2 = imread(_tf("street.png")); + std::vector samples(2); + samples[0] = img1; samples[1] = img2; + + // determine test type, whether batch or single img + int batch_size = refClassIds.size(); + CV_Assert(batch_size == 1 || batch_size == 2); + samples.resize(batch_size); + + Mat inp = blobFromImages(samples, 1.0/255, Size(416, 416), Scalar(), true, false); + + Net baseNet = readNetFromDarknet(findDataFile("dnn/" + cfg), findDataFile("dnn/" + weights, false)); + Net qnet = baseNet.quantize(inp, CV_32F, CV_32F); + qnet.setPreferableBackend(backend); + qnet.setPreferableTarget(target); + qnet.setInput(inp); + std::vector outs; + qnet.forward(outs, qnet.getUnconnectedOutLayersNames()); + + for (int b = 0; b < batch_size; ++b) + { + std::vector classIds; + std::vector confidences; + std::vector boxes; + for (int i = 0; i < outs.size(); ++i) + { + Mat out; + if (batch_size > 1){ + // get the sample slice from 3D matrix (batch, box, classes+5) + Range ranges[3] = {Range(b, b+1), Range::all(), Range::all()}; + out = outs[i](ranges).reshape(1, outs[i].size[1]); + }else{ + out = outs[i]; + } + for (int j = 0; j < out.rows; ++j) + { + Mat scores = out.row(j).colRange(5, out.cols); + double confidence; + Point maxLoc; + minMaxLoc(scores, 0, &confidence, 0, &maxLoc); + + if (confidence > confThreshold) { + float* detection = out.ptr(j); + double centerX = detection[0]; + double centerY = detection[1]; + double width = detection[2]; + double height = detection[3]; + boxes.push_back(Rect2d(centerX - 0.5 * width, centerY - 0.5 * height, + width, height)); + confidences.push_back(confidence); + classIds.push_back(maxLoc.x); + } + } + } + + // here we need NMS of boxes + std::vector indices; + NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices); + + std::vector nms_classIds; + std::vector nms_confidences; + std::vector nms_boxes; + + for (size_t i = 0; i < indices.size(); ++i) + { + int idx = indices[i]; + Rect2d box = boxes[idx]; + float conf = confidences[idx]; + int class_id = classIds[idx]; + nms_boxes.push_back(box); + nms_confidences.push_back(conf); + nms_classIds.push_back(class_id); + } + + if (cvIsNaN(iouDiff)) + { + if (b == 0) + std::cout << "Skip accuracy checks" << std::endl; + continue; + } + + normAssertDetections(refClassIds[b], refScores[b], refBoxes[b], nms_classIds, nms_confidences, nms_boxes, + format("batch size %d, sample %d\n", batch_size, b).c_str(), confThreshold, scoreDiff, iouDiff); + } + } +}; + +TEST_P(Test_Int8_nets, AlexNet) +{ +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + 
applyTestTag(CV_TEST_TAG_MEMORY_2GB); +#else + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#endif + if (backend != DNN_BACKEND_OPENCV) + throw SkipTestException("Only OpenCV backend is supported"); + + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_alexnet.prototxt"), + findDataFile("dnn/bvlc_alexnet.caffemodel", false)); + + Mat inp = imread(_tf("grace_hopper_227.png")); + Mat blob = blobFromImage(inp, 1.0, Size(227, 227), Scalar(), false); + Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy")); + + float l1 = 1e-4, lInf = 0.003; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, GoogLeNet) +{ + Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"), + findDataFile("dnn/bvlc_googlenet.caffemodel", false)); + + std::vector inpMats; + inpMats.push_back( imread(_tf("googlenet_0.png")) ); + inpMats.push_back( imread(_tf("googlenet_1.png")) ); + Mat blob = blobFromImages(inpMats, 1.0, Size(224, 224), Scalar(), false); + Mat ref = blobFromNPY(_tf("googlenet_prob.npy")); + + float l1 = 2e-4, lInf = 0.06; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, ResNet50) +{ + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); + if (backend != DNN_BACKEND_OPENCV) + throw SkipTestException("Only OpenCV backend is supported"); + + Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"), + findDataFile("dnn/ResNet-50-model.caffemodel", false)); + + Mat inp = imread(_tf("googlenet_0.png")); + Mat blob = blobFromImage(inp, 1.0, Size(224, 224), Scalar(), false); + Mat ref = blobFromNPY(_tf("resnet50_prob.npy")); + + float l1 = 3e-4, lInf = 0.035; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, DenseNet121) +{ + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + + Net net = readNetFromCaffe(findDataFile("dnn/DenseNet_121.prototxt", false), + findDataFile("dnn/DenseNet_121.caffemodel", false)); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0 / 255.0, Size(224, 224), Scalar(), true, true); + Mat ref = blobFromNPY(_tf("densenet_121_output.npy")); + + float l1 = 0.76, lInf = 3.31; // seems wrong + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, SqueezeNet_v1_1) +{ + if(target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"), + findDataFile("dnn/squeezenet_v1.1.caffemodel", false)); + + Mat inp = imread(_tf("googlenet_0.png")); + Mat blob = blobFromImage(inp, 1.0, Size(227, 227), Scalar(), false, true); + Mat ref = blobFromNPY(_tf("squeezenet_v1.1_prob.npy")); + + float l1 = 3e-4, lInf = 0.056; + testClassificationNet(net, blob, ref, l1, lInf); +} + +TEST_P(Test_Int8_nets, CaffeNet) +{ +#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32)) + applyTestTag(CV_TEST_TAG_MEMORY_2GB); +#else + applyTestTag(target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#endif + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD + && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + float l1 = 4e-5, lInf = 0.0025; + testONNXNet("caffenet", l1, lInf); +} + +TEST_P(Test_Int8_nets, RCNN_ILSVRC13) +{ +#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32)) + applyTestTag(CV_TEST_TAG_MEMORY_2GB); +#else + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#endif + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD + && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + float l1 = 0.02, lInf = 0.042; + testONNXNet("rcnn_ilsvrc13", l1, lInf); +} + +TEST_P(Test_Int8_nets, Inception_v2) +{ + testONNXNet("inception_v2", default_l1, default_lInf, true); +} + +TEST_P(Test_Int8_nets, MobileNet_v2) +{ + testONNXNet("mobilenetv2", default_l1, default_lInf, true); +} + +TEST_P(Test_Int8_nets, Shufflenet) +{ + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + { + if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); + } + testONNXNet("shufflenet", default_l1, default_lInf); +} + +TEST_P(Test_Int8_nets, MobileNet_SSD) +{ + Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false), + findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false)); + + Mat inp = imread(_tf("street.png")); + Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); + Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy")); + + float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, MobileNet_v1_SSD) +{ + Net net = readNetFromTensorflow(findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", false), + findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy")); + + float confThreshold = 0.5, scoreDiff = 0.034, iouDiff = 0.13; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, MobileNet_v1_SSD_PPN) +{ +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) + applyTestTag(target == DNN_TARGET_OPENCL ? 
CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, + CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + Net net = readNetFromTensorflow(findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pb", false), + findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/ssd_mobilenet_v1_ppn_coco.detection_out.npy")); + + float confThreshold = 0.51, scoreDiff = 0.04, iouDiff = 0.06; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, Inception_v2_SSD) +{ + applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + Net net = readNetFromTensorflow(findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pb", false), + findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pbtxt")); + + Mat inp = imread(_tf("street.png")); + Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false); + Mat ref = (Mat_(5, 7) << 0, 1, 0.90176028, 0.19872092, 0.36311883, 0.26461923, 0.63498729, + 0, 3, 0.93569964, 0.64865261, 0.45906419, 0.80675775, 0.65708131, + 0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015, + 0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527, + 0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384); + + float confThreshold = 0.5, scoreDiff = 0.0114, iouDiff = 0.22; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, opencv_face_detector) +{ + Net net = readNetFromCaffe(findDataFile("dnn/opencv_face_detector.prototxt"), + findDataFile("dnn/opencv_face_detector.caffemodel", false)); + + Mat inp = imread(findDataFile("gpu/lbpcascade/er.png")); + Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false); + Mat ref = (Mat_(6, 7) << 0, 1, 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631, + 0, 1, 0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168, + 0, 1, 0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290, + 0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477, + 0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494, + 0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801); + + float confThreshold = 0.5, scoreDiff = 0.002, iouDiff = 0.21; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, EfficientDet) +{ + if (target != DNN_TARGET_CPU) + { + if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL); + if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); + } + Net net = readNetFromTensorflow(findDataFile("dnn/efficientdet-d0.pb", false), + findDataFile("dnn/efficientdet-d0.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0/255, Size(512, 512), Scalar(123.675, 116.28, 103.53)); + Mat ref = (Mat_(3, 7) << 0, 1, 0.8437444, 0.153996080160141, 0.20534580945968628, 0.7463544607162476, 
0.7414066195487976, + 0, 17, 0.8245924, 0.16657517850399017, 0.3996818959712982, 0.4111558794975281, 0.9306337833404541, + 0, 7, 0.8039304, 0.6118435263633728, 0.13175517320632935, 0.9065558314323425, 0.2943994700908661); + + float confThreshold = 0.65, scoreDiff = 0.17, iouDiff = 0.18; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_resnet50) +{ + applyTestTag( + (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB), + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + +#ifdef INF_ENGINE_RELEASE + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && + (INF_ENGINE_VER_MAJOR_LT(2019020000) || target != DNN_TARGET_CPU)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + + if (INF_ENGINE_VER_MAJOR_GT(2019030000) && + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + + Net net = readNetFromTensorflow(findDataFile("dnn/faster_rcnn_resnet50_coco_2018_01_28.pb", false), + findDataFile("dnn/faster_rcnn_resnet50_coco_2018_01_28.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(800, 600), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/faster_rcnn_resnet50_coco_2018_01_28.detection_out.npy")); + + float confThreshold = 0.5, scoreDiff = 0.025, iouDiff = 0.15; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_inceptionv2) +{ + applyTestTag( + (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB), + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + +#ifdef INF_ENGINE_RELEASE + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && + (INF_ENGINE_VER_MAJOR_LT(2019020000) || target != DNN_TARGET_CPU)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + + if (INF_ENGINE_VER_MAJOR_GT(2019030000) && + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); +#endif + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + + if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + + Net net = readNetFromTensorflow(findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false), + findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt")); + + Mat inp = imread(_tf("dog416.png")); + Mat blob = blobFromImage(inp, 1.0, Size(800, 600), Scalar(), true, false); + Mat ref = blobFromNPY(_tf("tensorflow/faster_rcnn_inception_v2_coco_2018_01_28.detection_out.npy")); + + float confThreshold = 0.5, scoreDiff = 0.21, iouDiff = 0.1; + testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_vgg16) +{ + applyTestTag( +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + CV_TEST_TAG_MEMORY_2GB, +#else + (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB), +#endif + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + +#if defined(INF_ENGINE_RELEASE) + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) + applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); +#endif + + Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_vgg16.prototxt"), + findDataFile("dnn/VGG16_faster_rcnn_final.caffemodel", false)); + + Mat ref = (Mat_(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849, + 0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953, + 0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166); + + float confThreshold = 0.8, scoreDiff = 0.024, iouDiff = 0.35; + testFaster(net, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, FasterRCNN_zf) +{ + applyTestTag( +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + CV_TEST_TAG_MEMORY_2GB, +#else + (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB), +#endif + CV_TEST_TAG_DEBUG_LONG + ); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); + + if (target == DNN_TARGET_CUDA_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + + Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_zf.prototxt"), + findDataFile("dnn/ZF_faster_rcnn_final.caffemodel", false)); + + Mat ref = (Mat_(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395, + 0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762, + 0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176); + + float confThreshold = 0.8, scoreDiff = 0.021, iouDiff = 0.1; + testFaster(net, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, RFCN) +{ + applyTestTag( + (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_2GB), + CV_TEST_TAG_LONG, + CV_TEST_TAG_DEBUG_VERYLONG + ); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); + + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); + + Net net = readNetFromCaffe(findDataFile("dnn/rfcn_pascal_voc_resnet50.prototxt"), + findDataFile("dnn/resnet50_rfcn_final.caffemodel", false)); + + Mat ref = (Mat_(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234, + 0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16); + + float confThreshold = 0.8, scoreDiff = 0.017, iouDiff = 0.11; + testFaster(net, ref, confThreshold, scoreDiff, iouDiff); +} + +TEST_P(Test_Int8_nets, YoloVoc) +{ + applyTestTag( +#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL) + CV_TEST_TAG_MEMORY_2GB, +#else + CV_TEST_TAG_MEMORY_1GB, +#endif + CV_TEST_TAG_LONG + ); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); +#endif +#if defined(INF_ENGINE_RELEASE) + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && + target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); +#endif + + Mat ref = (Mat_(6, 7) << 0, 6, 0.750469f, 0.577374f, 0.127391f, 0.902949f, 0.300809f, + 0, 1, 0.780879f, 0.270762f, 0.264102f, 0.732475f, 0.745412f, + 0, 11, 0.901615f, 0.1386f, 0.338509f, 0.421337f, 0.938789f, + 1, 14, 0.623813f, 0.183179f, 0.381921f, 0.247726f, 0.625847f, + 1, 6, 0.667770f, 0.446555f, 0.453578f, 0.499986f, 
0.519167f, + 1, 6, 0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f); + + std::string config_file = "yolo-voc.cfg"; + std::string weights_file = "yolo-voc.weights"; + + double scoreDiff = 0.1, iouDiff = 0.3; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff); + } + + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff); + } +} + +TEST_P(Test_Int8_nets, TinyYoloVoc) +{ + applyTestTag(CV_TEST_TAG_MEMORY_512MB); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +#if defined(INF_ENGINE_RELEASE) + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && + target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); +#endif + + Mat ref = (Mat_(4, 7) << 0, 6, 0.761967f, 0.579042f, 0.159161f, 0.894482f, 0.31994f, + 0, 11, 0.780595f, 0.129696f, 0.386467f, 0.445275f, 0.920994f, + 1, 6, 0.651450f, 0.460526f, 0.458019f, 0.522527f, 0.5341f, + 1, 6, 0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); + + std::string config_file = "tiny-yolo-voc.cfg"; + std::string weights_file = "tiny-yolo-voc.weights"; + + double scoreDiff = 0.043, iouDiff = 0.12; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff); + } + + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff); + } +} + +TEST_P(Test_Int8_nets, YOLOv3) +{ + applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB)); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH); + + const int N0 = 3; + const int N1 = 6; + static const float ref_[/* (N0 + N1) * 7 */] = { +0, 16, 0.998836f, 0.160024f, 0.389964f, 0.417885f, 0.943716f, +0, 1, 0.987908f, 0.150913f, 0.221933f, 0.742255f, 0.746261f, +0, 7, 0.952983f, 0.614621f, 0.150257f, 0.901368f, 0.289251f, + +1, 2, 0.997412f, 0.647584f, 0.459939f, 0.821037f, 0.663947f, +1, 2, 0.989633f, 0.450719f, 0.463353f, 0.496306f, 0.522258f, +1, 0, 0.980053f, 0.195856f, 0.378454f, 0.258626f, 0.629257f, +1, 9, 0.785341f, 0.665503f, 0.373543f, 0.688893f, 0.439244f, +1, 9, 0.733275f, 0.376029f, 0.315694f, 0.401776f, 0.395165f, +1, 9, 0.384815f, 0.659824f, 0.372389f, 0.673927f, 0.429412f, + }; + Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); + + std::string config_file = "yolov3.cfg"; + std::string weights_file = "yolov3.weights"; + + double scoreDiff = 0.08, iouDiff = 0.21, confThreshold = 0.25; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold); + } + +#if defined(INF_ENGINE_RELEASE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + { + if (target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_OPENCL_FP16 && INF_ENGINE_VER_MAJOR_LE(202010000)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); + } +#endif + + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold); + } +} + +TEST_P(Test_Int8_nets, YOLOv4) +{ + applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB)); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + const int N0 = 3; + const int N1 = 7; + static const float ref_[/* (N0 + N1) * 7 */] = { +0, 16, 0.992194f, 0.172375f, 0.402458f, 0.403918f, 0.932801f, +0, 1, 0.988326f, 0.166708f, 0.228236f, 0.737208f, 0.735803f, +0, 7, 0.94639f, 0.602523f, 0.130399f, 0.901623f, 0.298452f, + +1, 2, 0.99761f, 0.646556f, 0.45985f, 0.816041f, 0.659067f, +1, 0, 0.988913f, 0.201726f, 0.360282f, 0.266181f, 0.631728f, +1, 2, 0.98233f, 0.452007f, 0.462217f, 0.495612f, 0.521687f, +1, 9, 0.919195f, 0.374642f, 0.316524f, 0.398126f, 0.393714f, +1, 9, 0.856303f, 0.666842f, 0.372215f, 0.685539f, 0.44141f, +1, 9, 0.313516f, 0.656791f, 0.374734f, 0.671959f, 0.438371f, +1, 9, 0.256625f, 0.940232f, 0.326931f, 0.967586f, 0.374002f, + }; + Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); + + std::string config_file = "yolov4.cfg"; + std::string weights_file = "yolov4.weights"; + double scoreDiff = 0.1, iouDiff = 0.17; + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff); + } + + { + SCOPED_TRACE("batch size 2"); + +#if defined(INF_ENGINE_RELEASE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) + { + if (target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_OPENCL_FP16 && INF_ENGINE_VER_MAJOR_LE(202010000)) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + else if (target == DNN_TARGET_MYRIAD && + getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); + } +#endif + + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff); + } +} + +TEST_P(Test_Int8_nets, YOLOv4_tiny) +{ + applyTestTag( + target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB + ); + +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000) + if (target == DNN_TARGET_MYRIAD) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + const float confThreshold = 0.6; + + const int N0 = 2; + const int N1 = 3; + static const float ref_[/* (N0 + N1) * 7 */] = { +0, 7, 0.85935f, 0.593484f, 0.141211f, 0.920356f, 0.291593f, +0, 16, 0.795188f, 0.169207f, 0.386886f, 0.423753f, 0.933004f, + +1, 2, 0.996832f, 0.653802f, 0.464573f, 0.815193f, 0.653292f, +1, 2, 0.963325f, 0.451151f, 0.458915f, 0.496255f, 0.52241f, +1, 0, 0.926244f, 0.194851f, 0.361743f, 0.260277f, 0.632364f, + }; + Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); + + std::string config_file = "yolov4-tiny.cfg"; + std::string weights_file = "yolov4-tiny.weights"; + double scoreDiff = 0.12; + double iouDiff = target == DNN_TARGET_OPENCL_FP16 ? 
0.2 : 0.082; + +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD) // bad accuracy + iouDiff = std::numeric_limits::quiet_NaN(); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL) + iouDiff = std::numeric_limits::quiet_NaN(); + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + iouDiff = std::numeric_limits::quiet_NaN(); +#endif + + { + SCOPED_TRACE("batch size 1"); + testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold); + } + + /* bad accuracy on second image + { + SCOPED_TRACE("batch size 2"); + testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold); + } + */ + +#if defined(INF_ENGINE_RELEASE) + if (target == DNN_TARGET_MYRIAD) // bad accuracy + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION); + if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || + backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif +} + +INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_nets, dnnBackendsAndTargets()); +}} // namespace
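For reference, the end-to-end int8 workflow that the tests above exercise can be summarized in a short standalone sketch. This is not part of the patch; the model path, image, and preprocessing values are placeholders, and the quantize() / getInputDetails() / getOutputDetails() calls are used exactly as in Test_Int8_layers::testLayer above, so treat it as a usage sketch under those assumptions rather than reference documentation.

// Minimal usage sketch (not patch code). Placeholders: "model.onnx", "input.jpg",
// the 224x224 size and 1/255 scaling. API calls mirror Test_Int8_layers::testLayer.
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    using namespace cv;
    using namespace cv::dnn;

    // Load a float32 model and prepare a representative calibration blob.
    Net net  = readNetFromONNX("model.onnx");                            // placeholder path
    Mat img  = imread("input.jpg");                                      // placeholder image
    Mat blob = blobFromImage(img, 1.0/255, Size(224, 224), Scalar(), true, false);

    // Quantize with int8 inputs and outputs; passing CV_32F/CV_32F instead keeps
    // float I/O and only runs the internal layers in int8, as testClassificationNet does.
    Net qnet = net.quantize(blob, CV_8S, CV_8S);

    // Per-input / per-output quantization parameters chosen during calibration.
    std::vector<float> inScale, outScale;
    std::vector<int>   inZp, outZp;
    qnet.getInputDetails(inScale, inZp);
    qnet.getOutputDetails(outScale, outZp);

    // Quantize the input:  q = f / scale + zeropoint
    Mat blob8;
    blob.convertTo(blob8, CV_8S, 1.0 / inScale[0], inZp[0]);

    qnet.setInput(blob8);
    Mat out8 = qnet.forward();

    // Dequantize the output:  f = scale * (q - zeropoint)
    Mat out;
    out8.convertTo(out, CV_32F, outScale[0], -(outScale[0] * outZp[0]));

    return 0;
}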
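The activation-layer tryQuantize() overrides earlier in this patch (TanH, Swish, Mish, Sigmoid, ELU, AbsVal, BNLL) all follow one pattern: every possible int8 input code is dequantized, pushed through the float activation, and requantized once into a 256-entry table, so the int8 forward pass reduces to a single lookup. The sketch below restates that pattern as standalone code; the helper name and the free-function activation pointer are illustrative, not part of the patch.

// Illustrative LUT construction, assuming the same scale/zeropoint convention as
// the functors above: x = inScale*(q - inZp), q_out = outZp + round(f(x)/outScale).
#include <opencv2/core.hpp>
#include <cmath>
#include <cstdint>

static cv::Mat buildActivationLUT(float inpScale, int inpZp,
                                  float outScale, int outZp,
                                  float (*act)(float))
{
    cv::Mat lut(1, 256, CV_8S);
    int8_t* table = lut.ptr<int8_t>();
    for (int i = -128; i < 128; i++)
    {
        float x = inpScale * (i - inpZp);               // dequantize the int8 code
        float y = act(x);                               // float activation
        int q = outZp + (int)std::round(y / outScale);  // requantize
        table[i + 128] = cv::saturate_cast<int8_t>(q);
    }
    return lut;
}

// Example: an int8 sigmoid table; applying it is simply dst[j] = table[src[j] + 128].
static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }
// cv::Mat sigmoidLUT = buildActivationLUT(inScale, inZp, outScale, outZp, sigmoidf);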