int8 layers and 8-bit quantization support

pull/20228/head
SamFC10 3 years ago
parent 95919051e0
commit fa90e14b06
  1. 1
      modules/dnn/CMakeLists.txt
  2. 87
      modules/dnn/include/opencv2/dnn/all_layers.hpp
  3. 46
      modules/dnn/include/opencv2/dnn/dnn.hpp
  4. 383
      modules/dnn/src/dnn.cpp
  5. 38
      modules/dnn/src/init.cpp
  6. 178
      modules/dnn/src/int8layers/batch_norm_layer.cpp
  7. 1136
      modules/dnn/src/int8layers/convolution_layer.cpp
  8. 190
      modules/dnn/src/int8layers/elementwise_layers.cpp
  9. 577
      modules/dnn/src/int8layers/eltwise_layer.cpp
  10. 266
      modules/dnn/src/int8layers/fully_connected_layer.cpp
  11. 41
      modules/dnn/src/int8layers/layers_common.hpp
  12. 637
      modules/dnn/src/int8layers/layers_common.simd.hpp
  13. 595
      modules/dnn/src/int8layers/pooling_layer.cpp
  14. 157
      modules/dnn/src/int8layers/quantize_dequantize_layer.cpp
  15. 211
      modules/dnn/src/int8layers/scale_layer.cpp
  16. 176
      modules/dnn/src/int8layers/softmax_layer.cpp
  17. 12
      modules/dnn/src/layers/batch_norm_layer.cpp
  18. 5
      modules/dnn/src/layers/blank_layer.cpp
  19. 32
      modules/dnn/src/layers/concat_layer.cpp
  20. 9
      modules/dnn/src/layers/const_layer.cpp
  21. 42
      modules/dnn/src/layers/convolution_layer.cpp
  22. 179
      modules/dnn/src/layers/elementwise_layers.cpp
  23. 31
      modules/dnn/src/layers/eltwise_layer.cpp
  24. 5
      modules/dnn/src/layers/flatten_layer.cpp
  25. 39
      modules/dnn/src/layers/fully_connected_layer.cpp
  26. 12
      modules/dnn/src/layers/padding_layer.cpp
  27. 63
      modules/dnn/src/layers/permute_layer.cpp
  28. 17
      modules/dnn/src/layers/pooling_layer.cpp
  29. 5
      modules/dnn/src/layers/reorg_layer.cpp
  30. 5
      modules/dnn/src/layers/reshape_layer.cpp
  31. 8
      modules/dnn/src/layers/scale_layer.cpp
  32. 6
      modules/dnn/src/layers/shuffle_channel_layer.cpp
  33. 30
      modules/dnn/src/layers/slice_layer.cpp
  34. 16
      modules/dnn/src/layers/softmax_layer.cpp
  35. 11
      modules/dnn/src/layers/split_layer.cpp
  36. 1220
      modules/dnn/test/test_int8_layers.cpp

@ -9,6 +9,7 @@ endif()
set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV)
ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX)
ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)

@ -258,6 +258,14 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer
{
public:
int input_zp, output_zp;
float output_sc;
static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
};
class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer
{
public:
@ -300,6 +308,13 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<PoolingLayer> create(const LayerParams& params);
};
class CV_EXPORTS PoolingLayerInt8 : public PoolingLayer
{
public:
int input_zp, output_zp;
static Ptr<PoolingLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS SoftmaxLayer : public Layer
{
public:
@ -308,6 +323,14 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<SoftmaxLayer> create(const LayerParams& params);
};
class CV_EXPORTS SoftmaxLayerInt8 : public SoftmaxLayer
{
public:
float output_sc;
int output_zp;
static Ptr<SoftmaxLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS InnerProductLayer : public Layer
{
public:
@ -315,6 +338,13 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<InnerProductLayer> create(const LayerParams& params);
};
class CV_EXPORTS InnerProductLayerInt8 : public InnerProductLayer
{
public:
int output_zp;
static Ptr<InnerProductLayerInt8> create(const LayerParams& params);
};
class CV_EXPORTS MVNLayer : public Layer
{
public:
@ -341,6 +371,22 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<FlattenLayer> create(const LayerParams &params);
};
class CV_EXPORTS QuantizeLayer : public Layer
{
public:
float scale;
int zeropoint;
static Ptr<QuantizeLayer> create(const LayerParams &params);
};
class CV_EXPORTS DequantizeLayer : public Layer
{
public:
float scale;
int zeropoint;
static Ptr<DequantizeLayer> create(const LayerParams &params);
};
class CV_EXPORTS ConcatLayer : public Layer
{
public:
@ -352,6 +398,7 @@ CV__DNN_INLINE_NS_BEGIN
* Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat
*/
bool padding;
int paddingValue;
static Ptr<ConcatLayer> create(const LayerParams &params);
};
@ -459,7 +506,11 @@ CV__DNN_INLINE_NS_BEGIN
{
public:
virtual void forwardSlice(const float* src, float* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const = 0;
size_t outPlaneSize, int cn0, int cn1) const {};
virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
size_t outPlaneSize, int cn0, int cn1) const {};
};
class CV_EXPORTS ReLULayer : public ActivationLayer
@ -542,6 +593,12 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<ExpLayer> create(const LayerParams &params);
};
class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer
{
public:
static Ptr<ActivationLayerInt8> create(const LayerParams &params);
};
/* Layers used in semantic segmentation */
class CV_EXPORTS CropLayer : public Layer
@ -563,6 +620,12 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<EltwiseLayer> create(const LayerParams &params);
};
class CV_EXPORTS EltwiseLayerInt8 : public Layer
{
public:
static Ptr<EltwiseLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS BatchNormLayer : public ActivationLayer
{
public:
@ -572,6 +635,14 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<BatchNormLayer> create(const LayerParams &params);
};
class CV_EXPORTS BatchNormLayerInt8 : public BatchNormLayer
{
public:
float input_sc, output_sc;
int input_zp, output_zp;
static Ptr<BatchNormLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS MaxUnpoolLayer : public Layer
{
public:
@ -591,12 +662,26 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<ScaleLayer> create(const LayerParams& params);
};
class CV_EXPORTS ScaleLayerInt8 : public ScaleLayer
{
public:
float output_sc;
int output_zp;
static Ptr<ScaleLayerInt8> create(const LayerParams &params);
};
class CV_EXPORTS ShiftLayer : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS ShiftLayerInt8 : public Layer
{
public:
static Ptr<Layer> create(const LayerParams& params);
};
class CV_EXPORTS DataAugmentationLayer : public Layer
{
public:

@ -235,6 +235,15 @@ CV__DNN_INLINE_NS_BEGIN
*/
virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);
/** @brief Tries to quantize the given layer and compute the quantization parameters required for fixed point implementation.
* @param[in] scales input and output scales.
* @param[in] zeropoints input and output zeropoints.
* @param[out] params Quantized parameters required for fixed point implementation of that layer.
* @returns True if layer can be quantized.
*/
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params);
/** @brief Given the @p input blobs, computes the output @p blobs.
* @param[in] inputs the input blobs.
* @param[out] outputs allocated output blobs, which will store results of the computation.
@ -368,6 +377,16 @@ CV__DNN_INLINE_NS_BEGIN
*/
virtual void getScaleShift(Mat& scale, Mat& shift) const;
/**
* @brief Returns scale and zeropoint of layers
* @param[out] scale Output scale
* @param[out] zeropoint Output zeropoint
*
* By default, @p scale is 1 and @p zeropoint is 0.
*/
virtual void getScaleZeropoint(float& scale, int& zeropoint) const;
/**
* @brief "Deattaches" all the layers, attached to particular layer.
*/
@ -453,13 +472,21 @@ CV__DNN_INLINE_NS_BEGIN
/** @brief Adds new layer to the net.
* @param name unique name of the adding layer.
* @param type typename of the adding layer (type must be registered in LayerRegister).
* @param dtype datatype of output blobs.
* @param params parameters which will be used to initialize the creating layer.
* @returns unique identifier of created layer, or -1 if a failure will happen.
*/
int addLayer(const String &name, const String &type, const int &dtype, LayerParams &params);
/** @overload Datatype of output blobs set to default CV_32F */
int addLayer(const String &name, const String &type, LayerParams &params);
/** @brief Adds new layer and connects its first input to the first output of previously added layer.
* @see addLayer()
*/
int addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams &params);
/** @overload */
int addLayerToPrev(const String &name, const String &type, LayerParams &params);
/** @brief Converts string name of the layer to the integer identifier.
@ -551,6 +578,25 @@ CV__DNN_INLINE_NS_BEGIN
CV_WRAP_AS(forwardAndRetrieve) void forward(CV_OUT std::vector<std::vector<Mat> >& outputBlobs,
const std::vector<String>& outBlobNames);
/** @brief Returns a quantized Net from a floating-point Net.
* @param calibData Calibration data to compute the quantization parameters.
* @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S.
* @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S.
*/
CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype);
/** @brief Returns input scale and zeropoint for a quantized Net.
* @param scales output parameter for returning input scales.
* @param zeropoints output parameter for returning input zeropoints.
*/
CV_WRAP void getInputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
/** @brief Returns output scale and zeropoint for a quantized Net.
* @param scales output parameter for returning output scales.
* @param zeropoints output parameter for returning output zeropoints.
*/
CV_WRAP void getOutputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
/**
* @brief Compile Halide layers.
* @param[in] scheduler Path to YAML file with scheduling directives.

@ -574,9 +574,9 @@ struct LayerPin
struct LayerData
{
LayerData() : id(-1), skip(false), flag(0) {}
LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
: id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
LayerData() : id(-1), dtype(CV_32F), skip(false), flag(0) {}
LayerData(int _id, const String &_name, const String &_type, const int &_dtype, LayerParams &_params)
: id(_id), name(_name), type(_type), dtype(_dtype), params(_params), skip(false), flag(0)
{
CV_TRACE_FUNCTION();
@ -588,6 +588,7 @@ struct LayerData
int id;
String name;
String type;
int dtype; // Datatype of output blobs.
LayerParams params;
std::vector<LayerPin> inputBlobsId;
@ -944,7 +945,7 @@ public:
}
}
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, const int& dtype)
{
if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
{
@ -966,7 +967,8 @@ public:
{
Mat& unusedBlob = hostIt->second;
if (unusedBlob.total() >= targetTotal &&
unusedBlob.total() < bestBlobTotal)
unusedBlob.total() < bestBlobTotal &&
unusedBlob.type() == dtype)
{
bestBlobPin = hostIt->first;
bestBlob = unusedBlob;
@ -985,14 +987,13 @@ public:
{
// if dst already has been allocated with total(shape) elements,
// it won't be recreated and pointer of dst.data remains the same.
dst.create(shape, use_half ? CV_16S : CV_32F);
dst.create(shape, dtype);
addHost(lp, dst);
}
}
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
std::vector<LayerPin>& pinsForInternalBlobs,
bool use_half = false)
std::vector<LayerPin>& pinsForInternalBlobs)
{
CV_TRACE_FUNCTION();
@ -1063,7 +1064,7 @@ public:
reuse(ld.inputBlobsId[0], blobPin);
}
else
reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
reuseOrCreate(shapes[index], blobPin, *blobs[index], ld.dtype);
}
}
}
@ -1193,6 +1194,7 @@ struct Net::Impl : public detail::NetImplBase
lastLayerId = 0;
netWasAllocated = false;
netWasQuantized = false;
fusion = true;
isAsync = false;
preferableBackend = DNN_BACKEND_DEFAULT;
@ -1217,6 +1219,7 @@ struct Net::Impl : public detail::NetImplBase
int lastLayerId;
bool netWasAllocated;
bool netWasQuantized;
bool fusion;
bool isAsync;
std::vector<int64> layersTimings;
@ -1372,7 +1375,7 @@ struct Net::Impl : public detail::NetImplBase
currLayer->unsetAttached();
}
netWasAllocated = false;
layersTimings.clear();
}
@ -2541,10 +2544,11 @@ struct Net::Impl : public detail::NetImplBase
CV_Assert(layerShapesIt != layersShapes.end());
if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F)
ld.dtype = CV_16S;
std::vector<LayerPin> pinsForInternalBlobs;
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
preferableBackend == DNN_BACKEND_OPENCV &&
preferableTarget == DNN_TARGET_OPENCL_FP16);
blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
for (int i = 0; i < ld.outputBlobs.size(); ++i)
ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
@ -3148,7 +3152,8 @@ struct Net::Impl : public detail::NetImplBase
Mat& inp = layers[0].outputBlobs[i];
CV_Assert(inp.total());
if (preferableBackend == DNN_BACKEND_OPENCV &&
preferableTarget == DNN_TARGET_OPENCL_FP16)
preferableTarget == DNN_TARGET_OPENCL_FP16 &&
layers[0].dtype == CV_32F)
{
layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
}
@ -3458,6 +3463,25 @@ struct Net::Impl : public detail::NetImplBase
#endif
}
void getQuantizationParams(const Mat& src, std::vector<float>& scales, std::vector<int>& zeropoints)
{
const int qmin = -128; // INT8_MIN
const int qmax = 127; // INT8_MAX
double rmin, rmax, sc, zp;
cv::minMaxIdx(src, &rmin, &rmax);
// 0 must be present in the range [rmin, rmax]
rmin = std::min(rmin, 0.0);
rmax = std::max(rmax, 0.0);
sc = (rmax == rmin) ? 1.0 : (rmax - rmin)/(qmax - qmin);
zp = qmin - (rmin/sc);
scales.push_back((float)sc);
zeropoints.push_back((int)std::round(zp));
}
void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
{
std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
@ -3588,7 +3612,8 @@ struct Net::Impl : public detail::NetImplBase
Mat& inp = layers[0].outputBlobs[i];
CV_Assert(inp.total());
if (preferableBackend == DNN_BACKEND_OPENCV &&
preferableTarget == DNN_TARGET_OPENCL_FP16)
preferableTarget == DNN_TARGET_OPENCL_FP16 &&
layers[0].dtype == CV_32F)
{
layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
}
@ -3614,7 +3639,7 @@ struct Net::Impl : public detail::NetImplBase
const MatShape& shape = layersShapes[inputLayerId].out[inputLayerIds[i].oid];
layersShapes[layerId].in.push_back(shape);
}
it->second.layerInstance->updateMemoryShapes(layersShapes[layerId].in);
it->second.getLayerInstance()->updateMemoryShapes(layersShapes[layerId].in);
}
}
}
@ -4019,7 +4044,7 @@ Net::~Net()
{
}
int Net::addLayer(const String &name, const String &type, LayerParams &params)
int Net::addLayer(const String &name, const String &type, const int &dtype, LayerParams &params)
{
CV_TRACE_FUNCTION();
@ -4042,23 +4067,35 @@ int Net::addLayer(const String &name, const String &type, LayerParams &params)
id = ++impl->lastLayerId;
impl->layerNameToId.insert(std::make_pair(name, id));
impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
impl->layers.insert(std::make_pair(id, LayerData(id, name, type, dtype, params)));
if (params.get<bool>("has_dynamic_shapes", false))
impl->hasDynamicShapes = true;
return id;
}
int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
int Net::addLayer(const String &name, const String &type, LayerParams &params)
{
CV_TRACE_FUNCTION();
return addLayer(name, type, CV_32F, params);
}
int Net::addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams &params)
{
CV_TRACE_FUNCTION();
int prvLid = impl->lastLayerId;
int newLid = this->addLayer(name, type, params);
int newLid = this->addLayer(name, type, dtype, params);
this->connect(prvLid, 0, newLid, 0);
return newLid;
}
int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
{
CV_TRACE_FUNCTION();
return addLayerToPrev(name, type, CV_32F, params);
}
void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
{
CV_TRACE_FUNCTION();
@ -4169,16 +4206,19 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
ld.outputBlobsWrappers[i]->copyToHost();
}
}
if (ld.outputBlobs[0].depth() == CV_32F)
if (ld.outputBlobs[0].depth() == CV_16S)
{
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec = ld.outputBlobs;
} else {
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec.resize(ld.outputBlobs.size());
for (int i = 0; i < outputvec.size(); i++)
convertFp16(ld.outputBlobs[i], outputvec[i]);
}
else
{
// Output depth can be CV_32F or CV_8S
std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
outputvec = ld.outputBlobs;
}
}
else if (outputBlobs.isUMatVector())
{
@ -4264,11 +4304,277 @@ void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
}
}
Net Net::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype)
{
CV_TRACE_FUNCTION();
// Net can be quantized only once.
if (impl->netWasQuantized)
CV_Error(Error::StsBadArg, "Cannot quantize a quantized net");
CV_CheckType(inputsDtype, inputsDtype == CV_32F || inputsDtype == CV_8S, "Input depth should be CV_32F or CV_8S");
CV_CheckType(outputsDtype, outputsDtype == CV_32F || outputsDtype == CV_8S, "Output depth should be CV_32F or CV_8S");
bool originalFusion = impl->fusion;
int prefBackend = impl->preferableBackend;
int prefTarget = impl->preferableTarget;
// Disable fusions and use CPU backend to quantize net
setPreferableBackend(DNN_BACKEND_OPENCV);
setPreferableTarget(DNN_TARGET_CPU);
enableFusion(false);
if (calibData.isMat())
{
setInput(calibData.getMat());
}
else if (calibData.isMatVector())
{
std::vector<Mat> calibDataVec;
calibData.getMatVector(calibDataVec);
std::vector<String> inpNames = impl->netInputLayer->outNames;
CV_CheckEQ(calibDataVec.size(), inpNames.size(), "Calibration data size should be equal to number of inputs");
for (int i = 0; i < calibDataVec.size(); i++)
setInput(calibDataVec[i], inpNames[i]);
}
std::vector<String> outNames = getUnconnectedOutLayersNames();
std::vector<LayerPin> pins;
for (int i = 0; i < outNames.size(); i++)
pins.push_back(impl->getPinByAlias(outNames[i]));
impl->setUpNet(pins);
// Compute scales and zeropoints for all the layers
std::vector<std::vector<float> > scales;
std::vector<std::vector<int> > zeropoints;
for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++)
{
LayerData& ld = it->second;
if (!ld.skip)
{
Ptr<Layer> layer = ld.layerInstance;
std::vector<Mat> inps(ld.inputBlobs.size());
for (int i = 0; i < ld.inputBlobs.size(); ++i)
inps[i] = *ld.inputBlobs[i];
layer->forward(inps, ld.outputBlobs, ld.internals);
}
std::vector<float> sc;
std::vector<int> zp;
if (ld.type == "TanH")
{
sc.push_back(1.f/128);
zp.push_back(0);
}
else if (ld.type == "Sigmoid" || ld.type == "Softmax" || ld.type == "SoftMax")
{
if (ld.params.get<bool>("log_softmax", false))
{
sc.push_back(16.f/256);
zp.push_back(127);
}
else
{
sc.push_back(1.f/256);
zp.push_back(-128);
}
}
else if (ld.type == "Split" || ld.type == "Slice" || ld.type == "Crop")
{
std::vector<float> inp_sc; std::vector<int> inp_zp;
impl->getQuantizationParams(*ld.inputBlobs[0], inp_sc, inp_zp);
sc.assign(ld.outputBlobs.size(), inp_sc[0]);
zp.assign(ld.outputBlobs.size(), inp_zp[0]);
}
else
{
for (int i = 0; i < ld.outputBlobs.size(); i++)
impl->getQuantizationParams(ld.outputBlobs[i], sc, zp);
}
scales.push_back(sc);
zeropoints.push_back(zp);
}
// For some layers, the input and output scales/zeropoints must be equal so that rescaling of inputs
// is not needed during quantized inference. We start from the last layer and modify the layer's input scales/zeropoints
// TODO : Need a different approach. Current solution fails when 2 such layers have the same input layer
for (Impl::MapIdToLayerData::reverse_iterator it = impl->layers.rbegin(); it != impl->layers.rend(); ++it)
{
LayerData& ld = it->second;
// Layers with multiple outputs. Number of outputs is equal to number of inputs
if (ld.type == "Blank" || ld.type == "Dropout" || ld.type == "Identity" || ld.type == "Silence" ||
ld.type == "Flatten" || ld.type == "Padding" || ld.type == "Permute" || ld.type == "Reshape" ||
ld.type == "ReLU6" || ld.type == "Reorg" || ld.type == "ShuffleChannel" ||
(ld.type == "ReLU" && !ld.params.get<float>("negative_slope", 0.f)) /* ReLU with negative slope 0 */)
{
for (int i = 0; i < ld.outputBlobs.size(); i++)
{
LayerPin &pin = ld.inputBlobsId[i];
scales[pin.lid][pin.oid] = scales[ld.id][i];
zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][i];
}
}
// Layers with multiple inputs and single output.
else if ((ld.type == "Pooling" && toLowerCase(ld.params.get<String>("pool", "max")) == "max") /* Max Pooling */ ||
(ld.type == "Eltwise" && toLowerCase(ld.params.get<String>("operation", "sum")) == "max") /* Elementwise max */ ||
ld.type == "Concat")
{
for (int i = 0; i < ld.inputBlobsId.size(); i++)
{
LayerPin &pin = ld.inputBlobsId[i];
scales[pin.lid][pin.oid] = scales[ld.id][0];
zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][0];
}
}
}
// Create a new Net and add quantized layers to it.
Net dstNet;
dstNet.impl->netWasQuantized = true;
dstNet.setInputsNames(impl->netInputLayer->outNames);
dstNet.setPreferableBackend(prefBackend);
dstNet.setPreferableTarget(prefTarget);
dstNet.enableFusion(originalFusion);
for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++)
{
LayerData ld = it->second;
if (ld.id == 0)
{
LayerData &quantInpLd = dstNet.impl->layers[0];
quantInpLd.dtype = inputsDtype;
quantInpLd.params.set("scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
quantInpLd.params.set("zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
continue;
}
std::vector<LayerPin> inpPins = ld.inputBlobsId;
// Fill input and output scales/zeropoints for the layer
std::vector<std::vector<float> > inp_out_sc(2);
std::vector<std::vector<int> > inp_out_zp(2);
for (int i = 0; i < inpPins.size(); i++)
{
LayerPin &pin = inpPins[i];
inp_out_sc[0].push_back(scales[pin.lid][pin.oid]);
inp_out_zp[0].push_back(zeropoints[pin.lid][pin.oid]);
}
inp_out_sc[1] = scales[ld.id];
inp_out_zp[1] = zeropoints[ld.id];
// Quantize layer
Ptr<Layer> layer = ld.layerInstance;
if (layer->tryQuantize(inp_out_sc, inp_out_zp, ld.params))
{
ld.type += "Int8";
ld.dtype = CV_8S;
}
ld.params.set("scales", DictValue::arrayReal(inp_out_sc[1].data(), inp_out_sc[1].size()));
ld.params.set("zeropoints", DictValue::arrayInt(inp_out_zp[1].data(), inp_out_zp[1].size()));
// Check and add quantize/dequantize node before layer
for (int i = 0; i < inpPins.size(); i++)
{
LayerPin &pin = inpPins[i];
LayerData &inpLd = dstNet.impl->getLayerData(impl->getLayerName(pin.lid));
pin.lid = inpLd.id;
if (inpLd.dtype != ld.dtype)
{
String layerName = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? cv::format("quantize/%s/%d", inpLd.name.c_str(), pin.oid)
: cv::format("dequantize/%s/%d", inpLd.name.c_str(), pin.oid);
// Check if quantize/dequantize node for the input layer already exists
if (dstNet.impl->getLayerId(layerName) >= 0)
{
pin.lid = dstNet.impl->getLayerId(layerName);
pin.oid = 0;
}
else
{
LayerParams lp;
lp.set("scales", inp_out_sc[0][i]);
lp.set("zeropoints", inp_out_zp[0][i]);
lp.name = layerName;
lp.type = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? "Quantize" : "Dequantize";
int newLid = dstNet.addLayer(lp.name, lp.type, ld.dtype, lp);
dstNet.connect(pin.lid, pin.oid, newLid, 0);
pin.lid = newLid; pin.oid = 0;
}
}
}
// Add quantized layer to Net and connect to its inputs.
int newLid = dstNet.addLayer(ld.name, ld.type, ld.dtype, ld.params);
for( int i = 0; i < inpPins.size(); i++ )
dstNet.connect(inpPins[i].lid, inpPins[i].oid, newLid, i);
// If the layer is a output layer, add quantize/dequantize node after it based on output's data type.
if (ld.requiredOutputs.size() == 0 && ld.dtype != outputsDtype)
{
LayerParams lp;
lp.set("scales", inp_out_sc[1][0]);
lp.set("zeropoints", inp_out_zp[1][0]);
lp.name = ((ld.dtype == CV_32F && outputsDtype == CV_8S) ? "quantize/" : "dequantize/") + ld.name;
lp.type = (ld.dtype == CV_32F && outputsDtype == CV_8S) ? "Quantize" : "Dequantize";
dstNet.addLayerToPrev(lp.name, lp.type, outputsDtype, lp);
}
}
// Restore FP32 Net's backend, target and fusion
setPreferableBackend(prefBackend);
setPreferableTarget(prefTarget);
enableFusion(originalFusion);
return dstNet;
}
void Net::getInputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) const
{
if (!impl->netWasQuantized)
CV_Error(Error::StsBadFunc, "Net isn't quantized");
LayerParams &lp = impl->layers[0].params;
DictValue sc = lp.get("scales");
DictValue zp = lp.get("zeropoints");
for (int i = 0; i < sc.size(); i++)
{
scales.push_back(sc.get<float>(i));
zeropoints.push_back(zp.get<int>(i));
}
}
void Net::getOutputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) const
{
if (!impl->netWasQuantized)
CV_Error(Error::StsBadFunc, "Net isn't quantized");
std::vector<int> outLayerIds = getUnconnectedOutLayers();
for (auto &lid : outLayerIds)
{
LayerParams &lp = impl->layers[lid].params;
DictValue sc = lp.get("scales");
DictValue zp = lp.get("zeropoints");
for (int i = 0; i < sc.size(); i++)
{
scales.push_back(sc.get<float>(i));
zeropoints.push_back(zp.get<int>(i));
}
}
}
void Net::setPreferableBackend(int backendId)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG(backendId);
if (backendId == DNN_BACKEND_DEFAULT)
backendId = (Backend)PARAM_DNN_BACKEND_DEFAULT;
if (impl->netWasQuantized && backendId != DNN_BACKEND_OPENCV)
{
CV_LOG_WARNING(NULL, "DNN: Only default backend supports quantized networks");
backendId = DNN_BACKEND_OPENCV;
}
#ifdef HAVE_INF_ENGINE
if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
backendId = getInferenceEngineBackendTypeParam();
@ -4277,7 +4583,6 @@ void Net::setPreferableBackend(int backendId)
if( impl->preferableBackend != backendId )
{
impl->preferableBackend = backendId;
impl->netWasAllocated = false;
impl->clear();
}
}
@ -4287,6 +4592,13 @@ void Net::setPreferableTarget(int targetId)
CV_TRACE_FUNCTION();
CV_TRACE_ARG(targetId);
if (impl->netWasQuantized && targetId != DNN_TARGET_CPU &&
targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16)
{
CV_LOG_WARNING(NULL, "DNN: Only CPU and OpenCL/OpenCL FP16 target is supported by quantized networks");
targetId = DNN_TARGET_CPU;
}
if( impl->preferableTarget != targetId )
{
impl->preferableTarget = targetId;
@ -4306,7 +4618,6 @@ void Net::setPreferableTarget(int targetId)
impl->preferableTarget = DNN_TARGET_OPENCL;
#endif
}
impl->netWasAllocated = false;
impl->clear();
}
}
@ -4935,9 +5246,10 @@ void Net::getMemoryConsumption(const int layerId,
ShapesVec inLayerShapes, outLayerShapes;
getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float);
for(int i = 0; i < outLayerShapes.size(); i++)
{
blobs += total(outLayerShapes[i]) * sizeof(float);
blobs += total(outLayerShapes[i]) * elemSize;
}
}
@ -4986,7 +5298,7 @@ void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;
getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);
size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float);
for(int i = 0; i < layerIds.size(); i++)
{
int w = 0, b = 0;
@ -5001,7 +5313,7 @@ void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
for(int j = 0; j < outLayerShapes[i].size(); j++)
{
b += total(outLayerShapes[i][j]) * sizeof(float);
b += total(outLayerShapes[i][j]) * elemSize;
}
weights.push_back(w);
@ -5021,7 +5333,6 @@ void Net::enableFusion(bool fusion)
if( impl->fusion != fusion )
{
impl->fusion = fusion;
impl->netWasAllocated = false;
impl->clear();
}
}
@ -5195,6 +5506,12 @@ void Layer::getScaleShift(Mat& scale, Mat& shift) const
shift = Mat();
}
void Layer::getScaleZeropoint(float& scale, int& zeropoint) const
{
scale = 1.f;
zeropoint = 0;
}
void Layer::unsetAttached()
{
setActivation(Ptr<ActivationLayer>());
@ -5321,6 +5638,12 @@ void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::
this->forward(inputs, outputs, internals);
}
bool Layer::tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
return false;
}
Layer::~Layer() {}
bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,

@ -141,6 +141,44 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(LSTM, LSTMLayer);
CV_DNN_REGISTER_LAYER_CLASS(GRU, GRULayer);
CV_DNN_REGISTER_LAYER_CLASS(CumSum, CumSumLayer);
CV_DNN_REGISTER_LAYER_CLASS(Quantize, QuantizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(Dequantize, DequantizeLayer);
CV_DNN_REGISTER_LAYER_CLASS(ConvolutionInt8, ConvolutionLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(InnerProductInt8, InnerProductLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(PoolingInt8, PoolingLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(EltwiseInt8, EltwiseLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(BatchNormInt8, BatchNormLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(ScaleInt8, ScaleLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(ShiftInt8, ShiftLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(ReLUInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(ReLU6Int8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(SigmoidInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(TanHInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(SwishInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(MishInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(ELUInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(BNLLInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(AbsValInt8, ActivationLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(SoftmaxInt8, SoftmaxLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(SoftMaxInt8, SoftmaxLayerInt8);
CV_DNN_REGISTER_LAYER_CLASS(ConcatInt8, ConcatLayer);
CV_DNN_REGISTER_LAYER_CLASS(FlattenInt8, FlattenLayer);
CV_DNN_REGISTER_LAYER_CLASS(PaddingInt8, PaddingLayer);
CV_DNN_REGISTER_LAYER_CLASS(BlankInt8, BlankLayer);
CV_DNN_REGISTER_LAYER_CLASS(DropoutInt8, BlankLayer);
CV_DNN_REGISTER_LAYER_CLASS(IdentityInt8, BlankLayer);
CV_DNN_REGISTER_LAYER_CLASS(SilenceInt8, BlankLayer);
CV_DNN_REGISTER_LAYER_CLASS(ConstInt8, ConstLayer);
CV_DNN_REGISTER_LAYER_CLASS(ReshapeInt8, ReshapeLayer);
CV_DNN_REGISTER_LAYER_CLASS(SplitInt8, SplitLayer);
CV_DNN_REGISTER_LAYER_CLASS(SliceInt8, SliceLayer);
CV_DNN_REGISTER_LAYER_CLASS(CropInt8, CropLayer);
CV_DNN_REGISTER_LAYER_CLASS(PermuteInt8, PermuteLayer);
CV_DNN_REGISTER_LAYER_CLASS(ReorgInt8, ReorgLayer);
CV_DNN_REGISTER_LAYER_CLASS(ShuffleChannelInt8, ShuffleChannelLayer);
}
CV__DNN_INLINE_NS_END

@ -0,0 +1,178 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
class BatchNormLayerInt8Impl CV_FINAL : public BatchNormLayerInt8
{
public:
Mat origin_weights, origin_bias;
Mat weights_, bias_;
mutable int dims;
BatchNormLayerInt8Impl(const LayerParams& params)
: dims(-1)
{
setParamsFrom(params);
useGlobalStats = params.get<bool>("use_global_stats", true);
input_sc = params.get<float>("input_scale");
input_zp = params.get<int>("input_zeropoint");
output_sc = params.get<float>("scales");
output_zp = params.get<int>("zeropoints");
CV_Assert(blobs.size() == 2);
size_t n = blobs[0].total();
CV_Assert(blobs[1].total() == n &&
blobs[0].isContinuous() && blobs[1].isContinuous() &&
blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);
origin_weights = blobs[0];
origin_bias = blobs[1];
}
virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
{
origin_weights.convertTo(weights_, CV_32F, input_sc/output_sc);
addWeighted(origin_bias, 1.0/output_sc, weights_, -input_zp, output_zp, bias_, CV_32F);
}
void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
{
scale = origin_weights;
shift = origin_bias;
}
void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE
{
scale = output_sc;
zeropoint = output_zp;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
Mat w_, b_;
top->getScaleShift(w_, b_);
if (w_.empty() && b_.empty())
return false;
const int numChannels = weights_.total();
const int numFusedWeights = w_.total();
const int numFusedBias = b_.total();
if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w_.empty()) ||
(numFusedBias != numChannels && numFusedBias != 1 && !b_.empty()))
return false;
float new_sc;
int new_zp;
top->getScaleZeropoint(new_sc, new_zp);
Mat w = numFusedWeights == 1 ? Mat(1, numChannels, CV_32F, Scalar(w_.at<float>(0))) :
(w_.empty() ? Mat::ones(1, numChannels, CV_32F) : w_.reshape(1, 1));
Mat b = numFusedBias == 1 ? Mat(1, numChannels, CV_32F, Scalar(b_.at<float>(0))) :
(b_.empty() ? Mat::zeros(1, numChannels, CV_32F) : b_.reshape(1, 1));
weights_ = Mat(); bias_ = Mat();
multiply(origin_weights, w, weights_, input_sc/new_sc, CV_32F);
multiply(origin_bias, w, bias_);
add(bias_, b, bias_);
addWeighted(bias_, 1.0/new_sc, weights_, -input_zp, new_zp, bias_, CV_32F);
return true;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
dims = inputs[0].size();
if (!useGlobalStats && inputs[0][0] != 1)
CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
if (!activ_int8.empty())
{
return activ_int8->blobs.empty();
}
return false;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(blobs.size() == 2);
CV_Assert(inputs.size() == 1);
Mat &inpBlob = inputs[0];
int planeSize = 1;
for (size_t i = 2; i < inpBlob.dims; i++) {
planeSize *= inpBlob.size[i];
}
for (size_t ii = 0; ii < outputs.size(); ii++)
{
Mat &outBlob = outputs[ii];
for(int num = 0; num < outBlob.size[0]; num++)
{
for (int n = 0; n < outBlob.size[1]; n++)
{
float w = weights_.at<float>(n);
float b = bias_.at<float>(n);
Mat inpBlobPlane(1, planeSize, CV_8S, inpBlob.ptr<int8_t>(num, n));
Mat outBlobPlane(1, planeSize, CV_8S, outBlob.ptr<int8_t>(num, n));
inpBlobPlane.convertTo(outBlobPlane, CV_8S, w, b);
}
}
}
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
int64 flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
flops += 3*total(inputs[i]);
}
return flops;
}
private:
bool useGlobalStats;
};
Ptr<BatchNormLayerInt8> BatchNormLayerInt8::create(const LayerParams& params)
{
return Ptr<BatchNormLayerInt8>(new BatchNormLayerInt8Impl(params));
}
} // namespace dnn
} // namespace cv

File diff suppressed because it is too large Load Diff

@ -0,0 +1,190 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>
namespace cv
{
namespace dnn
{
class ActivationLayerInt8Impl CV_FINAL : public ActivationLayerInt8
{
public:
ActivationLayerInt8Impl(const LayerParams &params)
{
setParamsFrom(params);
activationLUT = !blobs.empty() ? blobs[0] : Mat();
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return true;
}
class Activation : public cv::ParallelLoopBody
{
public:
const Mat* src;
const Mat* lut;
Mat* dst;
int nstripes;
Activation() : src(0), lut(0), dst(0), nstripes(0){}
static void run(const Mat& src, const Mat& lut, Mat& dst, int nstripes)
{
Activation p;
p.src = &src;
p.lut = &lut;
p.dst = &dst;
p.nstripes = nstripes;
parallel_for_(Range(0, nstripes), p, nstripes);
}
void operator()(const Range &r) const CV_OVERRIDE
{
const int8_t* table = lut->ptr<int8_t>();
int nsamples = 1, outCn = 1;
size_t planeSize = 1;
if (src->dims > 1)
{
nsamples = src->size[0];
outCn = src->size[1];
}
else
outCn = src->size[0];
for (int i = 2; i < src->dims; ++i)
planeSize *= src->size[i];
size_t stripeSize = (planeSize + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, planeSize);
int len = (int)(stripeEnd - stripeStart);
for( int i = 0; i < nsamples; i++ )
{
const int8_t* srcptr = src->ptr<int8_t>(i) + stripeStart;
int8_t* dstptr = dst->ptr<int8_t>(i) + stripeStart;
for( int cn = 0; cn < outCn; cn++, srcptr += planeSize, dstptr += planeSize )
{
int i = 0;
#if CV_SIMD128
for( ; i <= len - 16; i += 16 )
{
v_int8x16 out(table[srcptr[i] + 128], table[srcptr[i+1] + 128], table[srcptr[i+2] + 128], table[srcptr[i+3] + 128],
table[srcptr[i+4] + 128], table[srcptr[i+5] + 128], table[srcptr[i+6] + 128], table[srcptr[i+7] + 128],
table[srcptr[i+8] + 128], table[srcptr[i+9] + 128], table[srcptr[i+10] + 128], table[srcptr[i+11] + 128],
table[srcptr[i+12] + 128], table[srcptr[i+13] + 128], table[srcptr[i+14] + 128], table[srcptr[i+15] + 128]);
v_store(dstptr + i, out);
}
#endif
for( ; i < len; i++ )
{
dstptr[i] = table[srcptr[i] + 128];
}
}
}
}
};
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
for (size_t i = 0; i < inputs.size(); i++)
{
const Mat &src = inputs[i];
if (!activationLUT.empty())
{
const int nstripes = getNumThreads();
Mat &dst = outputs[i];
CV_Assert(src.size == dst.size && src.type() == dst.type() &&
src.isContinuous() && dst.isContinuous() && src.type() == CV_8S);
Activation::run(src, activationLUT, dst, nstripes);
}
else
{
src.copyTo(outputs[i]);
}
}
}
void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
{
for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize )
{
int i = 0;
#if CV_SIMD128
for( ; i <= len - 16; i += 16 )
{
v_int8x16 out(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128],
lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128],
lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128],
lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]);
v_store(dst + i, out);
}
#endif
for( ; i < len; i++ )
dst[i] = lut[src[i] + 128];
}
}
void forwardSlice(const int* src, const int* lut, int* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
{
for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize )
{
int i = 0;
#if CV_SIMD128
for( ; i <= len - 16; i += 16 )
{
v_int32x4 out0(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128]);
v_int32x4 out1(lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128]);
v_int32x4 out2(lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128]);
v_int32x4 out3(lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]);
v_store(dst + i, out0);
v_store(dst + i + 4, out1);
v_store(dst + i + 8, out2);
v_store(dst + i + 12, out3);
}
#endif
for( ; i < len; i++ )
dst[i] = lut[src[i] + 128];
}
}
Mat activationLUT;
};
Ptr<ActivationLayerInt8> ActivationLayerInt8::create(const LayerParams& params)
{
return Ptr<ActivationLayerInt8>(new ActivationLayerInt8Impl(params));
}
}
}

@ -0,0 +1,577 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8
{
public:
enum EltwiseOp
{
PROD = 0,
SUM = 1,
MAX = 2
} op;
std::vector<float> coeffs;
std::vector<int> zeropoints;
enum OutputChannelsMode
{
ELTWISE_CHANNNELS_SAME = 0, //!< number of channels from inputs must be the same and equal to output's number of channels
ELTWISE_CHANNNELS_INPUT_0, //!< number of channels from inputs may be different,
//!< output's number of channels is equal to number of channels of first input
//!< number of channels of other inputs should not be greater than number of channels of first input
ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different,
//!< output's number of channels is equal to number of channels of first input
//!< there is restriction on number of channels of other inputs
//!< extra channels of other inputs is ignored
ELTWISE_CHANNNELS_USE_MAX, //!< number of channels from inputs may be different,
//!< output's number of channels is equal to maximal number of input channels
//!< @note supported operation: `SUM`
} channelsModeInput;
mutable OutputChannelsMode channelsMode; //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal)
mutable /*size_t*/int outputChannels;
EltwiseLayerInt8Impl(const LayerParams& params)
: outputChannels(0)
{
setParamsFrom(params);
offset = params.get<float>("offset", 0.f);
hasVecInput = false;
op = SUM;
if (params.has("operation"))
{
String operation = toLowerCase(params.get<String>("operation"));
if (operation == "prod")
op = PROD;
else if (operation == "sum")
op = SUM;
else if (operation == "max")
op = MAX;
else
CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
}
if (params.has("coeff"))
{
DictValue paramCoeff = params.get("coeff");
int i, n = paramCoeff.size();
coeffs.resize(n);
for (i = 0; i < n; i++)
{
coeffs[i] = paramCoeff.get<float>(i);
}
}
if (params.has("input_zeropoints"))
{
DictValue zp = params.get("input_zeropoints");
int i, n = zp.size();
zeropoints.resize(n);
for (i = 0; i < n; i++)
{
zeropoints[i] = zp.get<int>(i);
}
}
channelsModeInput = ELTWISE_CHANNNELS_SAME;
if (params.has("output_channels_mode"))
{
String v = toLowerCase(params.get<String>("output_channels_mode"));
if (v == "same")
{
channelsModeInput = ELTWISE_CHANNNELS_SAME;
}
else if (v == "input_0")
{
channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
}
else if (v == "input_0_truncate")
{
channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
}
else if (v == "max_input_channels")
{
channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
if (op != SUM)
CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
}
else
CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
}
channelsMode = channelsModeInput;
// TODO Must have checks for other unknown options
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() >= 2);
CV_Assert(inputs[0].size() >= 2);
CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
CV_Assert(op == SUM || op == PROD || coeffs.size() == 0);
int dims = inputs[0].size();
// Number of channels in output shape is determined by the first input tensor.
bool variableChannels = false;
int numChannels = inputs[0][1];
for (size_t i = 1; i < inputs.size(); i++)
{
CV_Assert(inputs[0][0] == inputs[i][0]); // batch sizes are equal
int input_channels = inputs[i][1];
if (numChannels != input_channels)
variableChannels = true;
if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
{
CV_Assert(numChannels == input_channels);
}
else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
{
CV_Assert(numChannels >= input_channels);
}
else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
{
// nothing to check
}
else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
{
numChannels = std::max(numChannels, input_channels);
}
else
{
CV_Assert(0 && "Internal error");
}
}
channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
outputChannels = numChannels;
outputs.assign(1, inputs[0]);
outputs[0][1] = numChannels;
if (dims > 2)
{
size_t vecIdx = 0;
bool isVecFound = false;
for (size_t i = 0; i < inputs.size(); i++)
{
bool allOnes = isAllOnes(inputs[i], 2, dims);
if (!allOnes && !isVecFound)
{
vecIdx = i;
isVecFound = true;
}
if (!allOnes && i != vecIdx)
{
for (size_t j = 2; j < dims; j++)
{
CV_Assert(inputs[vecIdx][j] == inputs[i][j]);
}
}
}
if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound)
{
for (size_t j = 2; j < dims; j++)
{
outputs[0][j] = inputs[vecIdx][j];
}
}
}
return false;
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inpShape = shape(inputs[i].size);
if (isAllOnes(inpShape, 2, inputs[i].dims))
{
hasVecInput = true;
return;
}
}
}
class EltwiseInvoker : public ParallelLoopBody
{
EltwiseLayerInt8Impl& self;
std::vector<const Mat*> srcs;
std::vector<int> srcNumChannels;
int nsrcs;
Mat* dst;
Mat* buf;
std::vector<float> coeffs;
std::vector<int> zeropoints;
int nstripes;
const Mat* activLUT;
const ActivationLayerInt8* activ;
int channels;
size_t planeSize;
float offset;
EltwiseInvoker(EltwiseLayerInt8Impl& self_)
: self(self_)
, nsrcs(0), dst(0), buf(0), nstripes(0), activ(0), channels(0)
, planeSize(0), offset(0)
{}
public:
static void run(EltwiseLayerInt8Impl& self,
const Mat* srcs, int nsrcs, Mat& buf, Mat& dst,
int nstripes, float offset)
{
const EltwiseOp op = self.op;
CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_8SC1, ""); CV_Assert(dst.isContinuous());
CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
CV_CheckGE(nsrcs, 2, "");
CV_Assert(self.outputChannels == dst.size[1]);
EltwiseInvoker p(self);
p.srcs.resize(nsrcs);
p.srcNumChannels.resize(nsrcs);
p.coeffs = self.coeffs; // can be sorted
p.zeropoints = self.zeropoints;
bool sortInputs = false;
for( int i = 0; i < nsrcs; i++ )
{
p.srcs[i] = &srcs[i];
CV_CheckEQ(srcs[i].dims, dst.dims, "");
CV_Assert(srcs[i].isContinuous());
CV_Assert(srcs[i].type() == dst.type());
p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;
if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
{
CV_Assert(srcs[i].size == dst.size);
}
else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
{
if (i == 0)
CV_Assert(srcs[0].size == dst.size);
CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
sortInputs = true;
}
else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
{
if (i == 0)
CV_Assert(srcs[0].size == dst.size);
sortInputs = true;
}
else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
{
CV_Assert(op == SUM);
CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
sortInputs = true;
}
else
{
CV_Assert(0 && "Internal error");
}
if (sortInputs)
{
// Sort srcs and coefficients in the desc order by number of channels
for (int j = i; j >= 1; j--)
{
if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
{
std::swap(p.srcs[j - 1], p.srcs[j]);
std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
if (!p.coeffs.empty())
std::swap(p.coeffs[j - 1], p.coeffs[j]);
if (!p.zeropoints.empty())
std::swap(p.zeropoints[j - 1], p.zeropoints[j]);
}
else
break;
}
}
}
p.nsrcs = nsrcs;
p.dst = &dst;
p.buf = &buf;
p.nstripes = nstripes;
p.offset = offset;
p.channels = (dst.dims >= 4 ? dst.size[1] : 1);
p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");
p.activLUT = &self.activationLUT;
p.activ = !self.activationLUT.empty() ? self.activ.get() : 0;
parallel_for_(Range(0, nstripes), p, nstripes);
}
void operator()(const Range& r) const CV_OVERRIDE
{
const EltwiseOp op = self.op;
size_t total = dst->size[0]*planeSize;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
const int* zeropointsptr = !zeropoints.empty() ? &zeropoints[0] : 0;
const int8_t* lutptr = !activLUT->empty() ? activLUT->ptr<int8_t>() : 0;
int8_t* dstptr0 = dst->ptr<int8_t>();
float* bufptr0 = buf->ptr<float>();
int blockSize0 = 1 << 12;
for (size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / planeSize);
int delta = (int)ofs - sampleIdx * planeSize;
int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
if( blockSize <= 0 )
break;
ofs += blockSize;
for (int c = 0; c < channels; c++)
{
size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
int8_t* dstptr = dstptr0 + dstIdx;
float* bufptr = bufptr0 + dstIdx;
// process first two inputs
{
const int8_t* srcptr0 = srcs[0]->ptr<int8_t>() + dstIdx;
const int inputIdx = 1;
int src1_channels = srcNumChannels[inputIdx];
if (c >= src1_channels)
{
// no data from second input
if (!coeffsptr)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = srcptr0[j];
}
}
else
{
float c0 = coeffsptr[0];
int z0 = op == PROD ? zeropointsptr[0] : 0;
for (int j = 0; j < blockSize; j++)
{
bufptr[j] = c0 * (srcptr0[j] - z0);
}
}
}
else
{
size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx;
if (op == PROD)
{
float c0 = coeffsptr[0];
float c1 = coeffsptr[1];
int z0 = zeropointsptr[0];
int z1 = zeropointsptr[1];
for (int j = 0; j < blockSize; j++)
{
bufptr[j] = (c0*(srcptr0[j] - z0)) * (c1*(srcptrI[j] - z1));
}
}
else if (op == MAX)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
}
}
else if (op == SUM)
{
float c0 = coeffsptr[0];
float c1 = coeffsptr[1];
for (int j = 0; j < blockSize; j++)
{
bufptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
}
}
else
CV_Error(Error::StsInternal, "");
}
}
// aggregate other inputs (3+)
for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++)
{
int srcI_channels = srcNumChannels[inputIdx];
if (c >= srcI_channels)
continue; // no data from second input
size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx;
if (op == PROD)
{
float cI = coeffsptr[inputIdx];
int zI = zeropointsptr[inputIdx];
for (int j = 0; j < blockSize; j++)
{
bufptr[j] *= cI*(srcptrI[j] - zI);
}
}
else if (op == MAX)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = std::max(dstptr[j], srcptrI[j]);
}
}
else if (op == SUM)
{
float cI = coeffsptr[inputIdx];
for (int j = 0; j < blockSize; j++)
{
bufptr[j] += cI * srcptrI[j];
}
}
else
CV_Error(Error::StsInternal, "");
}
// add offset and saturate cast to int8
if (op == SUM || op == PROD)
{
for (int j = 0; j < blockSize; j++)
{
dstptr[j] = saturate_cast<int8_t>(std::round(bufptr[j] + offset));
}
}
}
if( activ )
{
int8_t* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
activ->forwardSlice(ptr, lutptr, ptr, blockSize, planeSize, 0, channels);
}
}
}
};
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(outputs.size() == 1);
const int nstripes = getNumThreads();
if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2)
{
for (size_t i = 0; i < inputs.size(); i++)
{
MatShape inpShape = shape(inputs[i].size);
bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims);
if (allOnes)
{
Mat tmpInput = inputs[i];
MatShape outShape = shape(outputs[0].size);
size_t xSize = outShape[2];
for (size_t j = 3; j < outShape.size(); j++)
xSize *= outShape[j];
int dimVec[3] = {outShape[0], outShape[1], (int) xSize};
std::vector<int> matSizesVec(&dimVec[0], &dimVec[0] + 3);
inputs[i] = Mat(matSizesVec, tmpInput.type());
std::vector<int> idx(outShape.size(), 0);
std::vector<int> outIdx(inpShape.size(), 0);
for (size_t j = 0; j < outShape[0]; j++)
{
outIdx[0] = idx[0] = j;
for(size_t k = 0; k < outShape[1]; k++)
{
outIdx[1] = idx[1] = k;
for (size_t x = 0; x < xSize; x++)
{
outIdx[2] = x;
inputs[i].at<int8_t>(outIdx.data()) = tmpInput.at<int8_t>(idx.data());
}
}
}
inputs[i] = inputs[i].reshape(0, outShape);
}
}
}
Mat buf = Mat(shape(outputs[0]), CV_32F); // to store intermediate results
EltwiseInvoker::run(*this, &inputs[0], (int)inputs.size(), buf, outputs[0], nstripes, offset);
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
CV_Assert(inputs.size());
// FIXIT: handle inputs with different number of channels
long flops = inputs.size() * total(inputs[0]);
return flops;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
if (!activ_int8.empty())
{
activ = activ_int8;
if (!activ_int8->blobs.empty())
activationLUT = activ_int8->blobs[0];
return true;
}
return false;
}
Mat activationLUT;
Ptr<ActivationLayerInt8> activ;
private:
bool hasVecInput;
float offset;
};
Ptr<EltwiseLayerInt8> EltwiseLayerInt8::create(const LayerParams& params)
{
return Ptr<EltwiseLayerInt8>(new EltwiseLayerInt8Impl(params));
}
}
}

@ -0,0 +1,266 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
{
public:
enum { VEC_ALIGN = 32 };
FullyConnectedLayerInt8Impl(const LayerParams& params)
{
setParamsFrom(params);
output_zp = params.get<int>("zeropoints");
axis = params.get<int>("axis", 1);
if (blobs.size() == 3)
{
// blobs[0] - Weights
// blobs[1] - Bias fused with offset
// blobs[2] - Multipliers for output stage
int numOutput = params.get<int>("num_output");
int innerSize = (int)blobs[0].total() / numOutput;
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
CV_Assert((size_t)numOutput == blobs[1].total());
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
int vecsize = weightsMat.cols;
if (vecsize % VEC_ALIGN != 0)
{
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
wpadding.setTo(Scalar::all(0));
weightsMat = weightsBuf.colRange(0, vecsize);
blobs[0].copyTo(weightsMat);
}
biasMat = blobs[1] = blobs[1].reshape(1, 1);
outputMultiplier = blobs[2];
}
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &) const CV_OVERRIDE
{
int numOutput, cAxis;
CV_CheckEQ(inputs.size(), (size_t)1, "");
CV_CheckEQ(blobs[0].dims, 2, "");
numOutput = blobs[0].size[0];
CV_Assert((size_t)numOutput == blobs[1].total());
cAxis = normalize_axis(axis, inputs[0]);
MatShape outShape(cAxis + 1);
for (int i = 0; i < cAxis; ++i)
outShape[i] = inputs[0][i];
outShape.back() = numOutput;
outputs.resize(1, outShape);
return false;
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
if (!activ_int8.empty())
{
activ = activ_int8;
if (!activ_int8->blobs.empty())
activ_int8->blobs[0].convertTo(activationLUT, CV_32S);
return true;
}
return false;
}
class FullyConnected : public ParallelLoopBody
{
public:
FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {}
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
{
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
srcMat.type() == weights.type() && srcMat.type() == CV_8S &&
dstMat.type() == CV_32S && biasMat.type() == CV_32S &&
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols );
FullyConnected p;
p.srcMat = &srcMat;
p.weights = &weights;
p.biasMat = &biasMat;
p.outputMultiplier = &outputMultiplier;
p.activationLUT = &activationLUT;
p.dstMat = &dstMat;
p.nstripes = nstripes;
p.outZp = outZp;
p.activ = !activationLUT.empty() ? activ : 0;
p.useAVX2 = checkHardwareSupport(CPU_AVX2);
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
parallel_for_(Range(0, nstripes), p, nstripes);
}
void operator()(const Range& r) const CV_OVERRIDE
{
int valign = FullyConnectedLayerInt8Impl::VEC_ALIGN;
int nsamples = srcMat->rows;
int nw0 = weights->rows;
int k, vecsize = srcMat->cols;
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
size_t total = (size_t)nsamples*nw0;
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
size_t wstep = weights->step1();
AutoBuffer<int8_t> srcbuf(vecsize_aligned + valign);
int8_t* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(int8_t)));
const int* lutptr = !activationLUT->empty() ? activationLUT->ptr<int>() : 0;
for( k = vecsize; k < vecsize_aligned; k++ )
sptr[k] = 0;
for( size_t ofs = stripeStart; ofs < stripeEnd; )
{
int sampleIdx = (int)(ofs / nw0);
int delta = (int)(ofs - (size_t)sampleIdx*nw0);
const int8_t* sptr_ = srcMat->ptr<int8_t>(sampleIdx);
const int8_t* wptr = weights->ptr<int8_t>(delta);
int* dptr = dstMat->ptr<int>(sampleIdx) + delta;
const int* biasptr = biasMat->ptr<int>() + delta;
const float* multptr = outputMultiplier->ptr<float>() + delta;
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
#if CV_TRY_AVX512_SKX
if( useAVX512 )
opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
#if CV_TRY_AVX2
if( useAVX2 )
opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
{
int i = 0;
#if CV_SIMD
for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
{
v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),
vs2 = v_setzero_s32(), vs3 = v_setzero_s32();
v_int32x4 outzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
v_int32x4 s = v_load(biasptr + i);
v_float32x4 mult = v_load(multptr + i);
for( k = 0; k < vecsize; k += 16 )
{
v_int8x16 v = v_load_aligned(sptr + k);
vs0 = v_dotprod_expand_fast(v, v_load_aligned(wptr + k), vs0);
vs1 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep + k), vs1);
vs2 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*2 + k), vs2);
vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3);
}
s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3));
v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult);
v_store(dptr + i, v_min(v_max(out, outmin), outmax));
}
#endif
for( ; i < nw; i++, wptr += wstep )
{
int s0 = biasptr[i];
float mult0 = multptr[i];
for( k = 0; k < vecsize; k++ )
{
int8_t v = sptr[k];
s0 += (int)v*wptr[k];
}
int out0 = outZp + (int)std::round(s0*mult0);
dptr[i] = std::min(std::max(out0, -128), 127);
}
}
if(activ)
activ->forwardSlice(dptr, lutptr, dptr, 1, 1, delta, delta + nw);
ofs += nw;
}
}
const Mat *srcMat, *weights, *biasMat, *outputMultiplier, *activationLUT;
const ActivationLayerInt8* activ;
Mat* dstMat;
int nstripes, outZp;
bool useAVX2;
bool useAVX512;
};
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> input, output;
inputs_arr.getMatVector(input);
outputs_arr.getMatVector(output);
int axisCan = normalize_axis(axis, input[0].dims);
int outerSize = input[0].total(0, axisCan);
Mat srcMat = input[0].reshape(1, outerSize);
Mat dstMat = output[0].reshape(1, outerSize);
Mat dstMatInt32= Mat(shape(dstMat), CV_32S);
const int nstripes = getNumThreads();
FullyConnected::run(srcMat, weightsMat, biasMat, outputMultiplier, activationLUT, dstMatInt32, activ.get(), nstripes, output_zp);
dstMatInt32.convertTo(dstMat, CV_8S);
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(inputs); // suppress unused variable warning
long flops = 0;
int innerSize = blobs[0].size[1];
for(int i = 0; i < outputs.size(); i++)
{
flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
}
return flops;
}
Mat weightsMat, biasMat, outputMultiplier, activationLUT;
Ptr<ActivationLayerInt8> activ;
};
Ptr<InnerProductLayerInt8> InnerProductLayerInt8::create(const LayerParams& params)
{
return Ptr<InnerProductLayerInt8>(new FullyConnectedLayerInt8Impl(params));
}
}
}

@ -0,0 +1,41 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>
#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// dispatched AVX/AVX2 optimizations
#include "./layers_common.simd.hpp"
#include "int8layers/layers_common.simd_declarations.hpp"
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/ocl4dnn.hpp"
#endif
namespace cv
{
namespace dnn
{
void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations,
cv::String &padMode, std::vector<size_t>& adjust_pads);
void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<bool>& globalPooling,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);
void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& stride, const String &padMode,
const std::vector<size_t>& dilation, std::vector<int>& out);
void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
const std::vector<size_t>& strides, const String &padMode,
std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
}
}
#endif

@ -0,0 +1,637 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "opencv2/core/hal/intrin.hpp"
namespace cv {
namespace dnn {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
const int8_t* rowbuf, int* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float* multiplier, bool initOutput, bool finalOutput );
void fastDepthwiseConv( const int8_t* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int* biasptr, const float* multptr,
const int8_t* inptr_,
int height, int width,
int* outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp );
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp );
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX2
#define OPENCV_FMADD_EPI8(_Tpvec, func) \
inline _Tpvec _##func##_fmaddepi8_epi32(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
_Tpvec even_a = _##func##_srai_epi16(_##func##_bslli_epi128(a, 1), 8); \
_Tpvec odd_a = _##func##_srai_epi16(a, 8); \
\
_Tpvec even_b = _##func##_srai_epi16(_##func##_bslli_epi128(b, 1), 8); \
_Tpvec odd_b = _##func##_srai_epi16(b, 8); \
\
_Tpvec prod0 = _##func##_madd_epi16(even_a, even_b); \
_Tpvec prod1 = _##func##_madd_epi16(odd_a, odd_b); \
return _##func##_add_epi32(_##func##_add_epi32(prod0, prod1), c); \
}
OPENCV_FMADD_EPI8(__m256i, mm256)
//OPENCV_FMADD_EPI8(__m512i, mm512)
enum { FASCONV_BASE_VECSZ = 4 };
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
const int8_t* rowbuf, int* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float* multiplier, bool initOutput, bool finalOutput )
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2]*outShape[3];
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
int rsz = blockSize % FASCONV_BASE_VECSZ;
for( int i = 0; i < rsz; i++ )
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
__m128 mask = _mm_loadu_ps((const float*)maskbuf);
// now compute dot product of the weights
// and im2row-transformed part of the tensor
for( int i = 0; i < outCn; i += 3 )
{
const int8_t* wptr0 = weights + i*wstep;
const int8_t* wptr1 = wptr0 + wstep;
const int8_t* wptr2 = wptr1 + wstep;
int* outptr0 = output + i*outPlaneSize;
int* outptr1 = outptr0 + outPlaneSize;
int* outptr2 = outptr1 + outPlaneSize;
int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];
if( i+2 >= outCn )
{
wptr2 = wptr1;
outptr2 = outptr1;
bias2 = bias1;
mult2 = mult1;
if( i+1 >= outCn )
{
wptr2 = wptr1 = wptr0;
outptr2 = outptr1 = outptr0;
bias2 = bias1 = bias0;
mult2 = mult1 = mult0;
}
}
int j = 0;
for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
{
bool tail = false;
if (j + FASCONV_BASE_VECSZ > blockSize)
{
if (j == 0)
break;
j = blockSize - FASCONV_BASE_VECSZ;
tail = true;
}
int k = 0;
const int8_t* rptr = rowbuf + j*vecsize_aligned;
__m256i vs00 = _mm256_setzero_si256(), vs01 = _mm256_setzero_si256(),
vs02 = _mm256_setzero_si256(), vs03 = _mm256_setzero_si256(),
vs10 = _mm256_setzero_si256(), vs11 = _mm256_setzero_si256(),
vs12 = _mm256_setzero_si256(), vs13 = _mm256_setzero_si256(),
vs20 = _mm256_setzero_si256(), vs21 = _mm256_setzero_si256(),
vs22 = _mm256_setzero_si256(), vs23 = _mm256_setzero_si256();
/* TODO : Fix AVX-512 path. Segmentation fault in Conv2D Tests.
#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling
if (vecsize >= 64)
{
__m512i vs00_5 = _mm512_setzero_si512(), vs01_5 = _mm512_setzero_si512(),
vs02_5 = _mm512_setzero_si512(), vs03_5 = _mm512_setzero_si512(),
vs10_5 = _mm512_setzero_si512(), vs11_5 = _mm512_setzero_si512(),
vs12_5 = _mm512_setzero_si512(), vs13_5 = _mm512_setzero_si512(),
vs20_5 = _mm512_setzero_si512(), vs21_5 = _mm512_setzero_si512(),
vs22_5 = _mm512_setzero_si512(), vs23_5 = _mm512_setzero_si512();
for (; k <= vecsize - 64; k += 64, rptr += 64)
{
__m512i w0 = _mm512_load_si512(wptr0 + k);
__m512i w1 = _mm512_load_si512(wptr1 + k);
__m512i w2 = _mm512_load_si512(wptr2 + k);
__m512i r0 = _mm512_load_si512(rptr);
vs00_5 = _mm512_fmaddepi8_epi32(w0, r0, vs00_5);
vs10_5 = _mm512_fmaddepi8_epi32(w1, r0, vs10_5);
vs20_5 = _mm512_fmaddepi8_epi32(w2, r0, vs20_5);
r0 = _mm512_load_si512(rptr + vecsize_aligned);
vs01_5 = _mm512_fmaddepi8_epi32(w0, r0, vs01_5);
vs11_5 = _mm512_fmaddepi8_epi32(w1, r0, vs11_5);
vs21_5 = _mm512_fmaddepi8_epi32(w2, r0, vs21_5);
r0 = _mm512_load_si512(rptr + vecsize_aligned*2);
vs02_5 = _mm512_fmaddepi8_epi32(w0, r0, vs02_5);
vs12_5 = _mm512_fmaddepi8_epi32(w1, r0, vs12_5);
vs22_5 = _mm512_fmaddepi8_epi32(w2, r0, vs22_5);
r0 = _mm512_load_si512(rptr + vecsize_aligned*3);
vs03_5 = _mm512_fmaddepi8_epi32(w0, r0, vs03_5);
vs13_5 = _mm512_fmaddepi8_epi32(w1, r0, vs13_5);
vs23_5 = _mm512_fmaddepi8_epi32(w2, r0, vs23_5);
}
// now fold the 512 bit accumulator vectors into 256 bit vectors so that the AVX2 code can finish
// the tail of the vector
vs00 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs00_5, 0), _mm512_extracti32x8_epi32(vs00_5, 1));
vs10 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs10_5, 0), _mm512_extracti32x8_epi32(vs10_5, 1));
vs20 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs20_5, 0), _mm512_extracti32x8_epi32(vs20_5, 1));
vs01 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs01_5, 0), _mm512_extracti32x8_epi32(vs01_5, 1));
vs11 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs11_5, 0), _mm512_extracti32x8_epi32(vs11_5, 1));
vs21 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs21_5, 0), _mm512_extracti32x8_epi32(vs21_5, 1));
vs02 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs02_5, 0), _mm512_extracti32x8_epi32(vs02_5, 1));
vs12 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs12_5, 0), _mm512_extracti32x8_epi32(vs12_5, 1));
vs22 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs22_5, 0), _mm512_extracti32x8_epi32(vs22_5, 1));
vs03 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs03_5, 0), _mm512_extracti32x8_epi32(vs03_5, 1));
vs13 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs13_5, 0), _mm512_extracti32x8_epi32(vs13_5, 1));
vs23 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs23_5, 0), _mm512_extracti32x8_epi32(vs23_5, 1));
}
#endif
*/
for (; k < vecsize; k += 32, rptr += 32 )
{
__m256i w0 = _mm256_load_si256((const __m256i*)(wptr0 + k));
__m256i w1 = _mm256_load_si256((const __m256i*)(wptr1 + k));
__m256i w2 = _mm256_load_si256((const __m256i*)(wptr2 + k));
__m256i r0 = _mm256_load_si256((const __m256i*)rptr);
vs00 = _mm256_fmaddepi8_epi32(w0, r0, vs00);
vs10 = _mm256_fmaddepi8_epi32(w1, r0, vs10);
vs20 = _mm256_fmaddepi8_epi32(w2, r0, vs20);
r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned));
vs01 = _mm256_fmaddepi8_epi32(w0, r0, vs01);
vs11 = _mm256_fmaddepi8_epi32(w1, r0, vs11);
vs21 = _mm256_fmaddepi8_epi32(w2, r0, vs21);
r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*2));
vs02 = _mm256_fmaddepi8_epi32(w0, r0, vs02);
vs12 = _mm256_fmaddepi8_epi32(w1, r0, vs12);
vs22 = _mm256_fmaddepi8_epi32(w2, r0, vs22);
r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*3));
vs03 = _mm256_fmaddepi8_epi32(w0, r0, vs03);
vs13 = _mm256_fmaddepi8_epi32(w1, r0, vs13);
vs23 = _mm256_fmaddepi8_epi32(w2, r0, vs23);
}
__m256i t0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs00, vs01), _mm256_hadd_epi32(vs02, vs03));
__m256i t1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs10, vs11), _mm256_hadd_epi32(vs12, vs13));
__m256i t2 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs20, vs21), _mm256_hadd_epi32(vs22, vs23));
t0 = _mm256_add_epi32(t0, _mm256_permute2x128_si256(t0, t0, 1));
t1 = _mm256_add_epi32(t1, _mm256_permute2x128_si256(t1, t1, 1));
t2 = _mm256_add_epi32(t2, _mm256_permute2x128_si256(t2, t2, 1));
__m128i s0, s1, s2;
if( initOutput )
{
s0 = _mm_set1_epi32(bias0);
s1 = _mm_set1_epi32(bias1);
s2 = _mm_set1_epi32(bias2);
}
else
{
s0 = _mm_loadu_si128((__m128i*)(outptr0 + j));
s1 = _mm_loadu_si128((__m128i*)(outptr1 + j));
s2 = _mm_loadu_si128((__m128i*)(outptr2 + j));
}
s0 = _mm_add_epi32(s0, _mm256_castsi256_si128(t0));
s1 = _mm_add_epi32(s1, _mm256_castsi256_si128(t1));
s2 = _mm_add_epi32(s2, _mm256_castsi256_si128(t2));
if( finalOutput )
{
__m128i voutzp = _mm_set1_epi32(outZp);
__m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127);
s0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s0), _mm_set1_ps(mult0))));
s1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s1), _mm_set1_ps(mult1))));
s2 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s2), _mm_set1_ps(mult2))));
s0 = _mm_min_epi32(_mm_max_epi32(s0, outmin), outmax);
s1 = _mm_min_epi32(_mm_max_epi32(s1, outmin), outmax);
s2 = _mm_min_epi32(_mm_max_epi32(s2, outmin), outmax);
}
if( tail )
{
s0 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr0 + j), _mm_castsi128_ps(s0), mask));
s1 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr1 + j), _mm_castsi128_ps(s1), mask));
s2 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr2 + j), _mm_castsi128_ps(s2), mask));
}
_mm_storeu_si128((__m128i*)(outptr0 + j), s0);
_mm_storeu_si128((__m128i*)(outptr1 + j), s1);
_mm_storeu_si128((__m128i*)(outptr2 + j), s2);
}
for( ; j <= blockSize - 2; j += 2 )
{
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned;
int s00, s01, s10, s11, s20, s21;
if( initOutput )
{
s00 = s01 = bias0;
s10 = s11 = bias1;
s20 = s21 = bias2;
}
else
{
s00 = outptr0[j]; s01 = outptr0[j+1];
s10 = outptr1[j]; s11 = outptr1[j+1];
s20 = outptr2[j]; s21 = outptr2[j+1];
}
for( int k = 0; k < vecsize; k++ )
{
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
int8_t r = rptr0[k];
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
r = rptr1[k];
s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r;
}
if( finalOutput )
{
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127);
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127);
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127);
}
outptr0[j] = s00;
outptr0[j+1] = s01;
outptr1[j] = s10;
outptr1[j+1] = s11;
outptr2[j] = s20;
outptr2[j+1] = s21;
}
for( ; j < blockSize; j++ )
{
const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
int s00, s10, s20;
if( initOutput )
{
s00 = bias0;
s10 = bias1;
s20 = bias2;
}
else
{
s00 = outptr0[j];
s10 = outptr1[j];
s20 = outptr2[j];
}
for( int k = 0; k < vecsize; k++ )
{
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
int8_t r = rptr0[k];
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
}
if( finalOutput )
{
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
}
outptr0[j] = s00;
outptr1[j] = s10;
outptr2[j] = s20;
}
}
_mm256_zeroupper();
}
static inline void _mm256_expand_mul_add(const __m256i& a, const __m256i& b,
__m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3)
{
__m256i a0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a));
__m256i a1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a, 1));
__m256i b0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(b));
__m256i b1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b, 1));
__m256i a0b0 = _mm256_mullo_epi16(a0, b0);
__m256i a1b1 = _mm256_mullo_epi16(a1, b1);
out0 = _mm256_add_epi32(out0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a0b0)));
out1 = _mm256_add_epi32(out1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a0b0, 1)));
out2 = _mm256_add_epi32(out2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a1b1)));
out3 = _mm256_add_epi32(out3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a1b1, 1)));
}
static inline void _mm256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b)
{
__m256i t0 = _mm256_loadu_si256((const __m256i*)ptr);
__m256i t1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
__m256i p0 = _mm256_shuffle_epi8(t0, sh);
__m256i p1 = _mm256_shuffle_epi8(t1, sh);
__m256i lo = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
__m256i hi = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
a = _mm256_unpacklo_epi64(lo, hi);
b = _mm256_unpackhi_epi64(lo, hi);
}
void fastDepthwiseConv( const int8_t* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int* biasptr, const float* multptr,
const int8_t* inptr_,
int height, int width,
int* outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp)
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float mult = multptr[out_d];
int bias = biasptr[out_d];
int biasCopy;
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const int8_t* imgptr0 = inptr_ + in_i*width;
const int8_t* imgptr1 = imgptr0 + dilation_h*width;
const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
int out;
biasCopy = bias;
if (in_i < 0)
{
biasCopy += inpZp * (w00 + w01 + w02);
w00 = w01 = w02 = 0;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
biasCopy += inpZp * (w20 + w21 + w22);
w20 = w21 = w22 = 0;
imgptr2 = imgptr1;
}
int* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
(int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
(int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
biasCopy + inpZp*(w00 + w10 + w20);
outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
out_j = 1;
}
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
{
const int VECSZ = 32;
__m256i vw00 = _mm256_set1_epi8(w00), vw01 = _mm256_set1_epi8(w01), vw02 = _mm256_set1_epi8(w02),
vw10 = _mm256_set1_epi8(w10), vw11 = _mm256_set1_epi8(w11), vw12 = _mm256_set1_epi8(w12),
vw20 = _mm256_set1_epi8(w20), vw21 = _mm256_set1_epi8(w21), vw22 = _mm256_set1_epi8(w22);
__m256i vbias = _mm256_set1_epi32(biasCopy), voutzp = _mm256_set1_epi32(outZp),
outmin = _mm256_set1_epi32(-128), outmax = _mm256_set1_epi32(127);
__m256 vmult = _mm256_set1_ps(mult);
__m256i vout0, vout1, vout2, vout3;
if( stride_w == 1 )
{
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - VECSZ;
}
int in_j = out_j * stride_w - pad_l;
__m256i v00 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j)),
v01 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w)),
v02 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w*2)),
v10 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j)),
v11 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w)),
v12 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w*2)),
v20 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j)),
v21 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w)),
v22 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w*2));
vout0 = vout1 = vout2 = vout3 = vbias;
_mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult)));
vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult)));
vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult)));
vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult)));
vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax);
vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax);
vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax);
vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax);
_mm256_storeu_si256((__m256i*)(outptr + out_j), vout0);
_mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1);
_mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2);
_mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3);
}
}
else
{
for( ; out_j < outW1; out_j += VECSZ )
{
if (out_j + VECSZ > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - VECSZ;
}
int in_j = out_j * stride_w - pad_l;
__m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
_mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
_mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
_mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
_mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
_mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
_mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
vout0 = vout1 = vout2 = vout3 = vbias;
_mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
_mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult)));
vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult)));
vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult)));
vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult)));
vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax);
vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax);
vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax);
vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax);
_mm256_storeu_si256((__m256i*)(outptr + out_j), vout0);
_mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1);
_mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2);
_mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3);
}
}
}
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
(int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
(int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
}
for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
int s0 = 1, s1 = 1, s2 = 1;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0;
biasCopy += inpZp*(w00 + w10 + w20);
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0;
biasCopy += inpZp*(w01 + w11 + w21);
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0;
biasCopy += inpZp*(w02 + w12 + w22);
}
out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
(int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
(int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
}
}
_mm256_zeroupper();
}
// dst = vec * weights^t + bias
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp )
{
int i = 0;
for( ; i <= nvecs - 8; i += 8 )
{
const int8_t* wptr = weights + i*wstep;
__m256i vs0 = _mm256_setzero_si256(), vs1 = _mm256_setzero_si256(),
vs2 = _mm256_setzero_si256(), vs3 = _mm256_setzero_si256(),
vs4 = _mm256_setzero_si256(), vs5 = _mm256_setzero_si256(),
vs6 = _mm256_setzero_si256(), vs7 = _mm256_setzero_si256();
__m128i voutzp = _mm_set1_epi32(outZp);
__m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127);
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
{
__m256i v = _mm256_load_si256((const __m256i*)(vec + k));
vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0);
vs1 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep)), v, vs1);
vs2 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*2)), v, vs2);
vs3 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*3)), v, vs3);
vs4 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*4)), v, vs4);
vs5 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*5)), v, vs5);
vs6 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*6)), v, vs6);
vs7 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*7)), v, vs7);
}
__m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs1), _mm256_hadd_epi32(vs2, vs3));
__m256i s1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs4, vs5), _mm256_hadd_epi32(vs6, vs7));
s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1));
s1 = _mm256_add_epi32(s1, _mm256_permute2x128_si256(s1, s1, 1));
__m128i t0 = _mm_add_epi32(_mm256_castsi256_si128(s0), _mm_loadu_si128((__m128i*)(bias + i)));
__m128i t1 = _mm_add_epi32(_mm256_castsi256_si128(s1), _mm_loadu_si128((__m128i*)(bias + i + 4)));
t0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t0), _mm_loadu_ps(multiplier + i))));
t1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t1), _mm_loadu_ps(multiplier + i + 4))));
t0 = _mm_min_epi32(_mm_max_epi32(t0, outmin), outmax);
t1 = _mm_min_epi32(_mm_max_epi32(t1, outmin), outmax);
_mm_storeu_si128((__m128i*)(dst + i), t0);
_mm_storeu_si128((__m128i*)(dst + i + 4), t1);
}
for( ; i < nvecs; i++ )
{
const int8_t* wptr = weights + i*wstep;
__m256i vs0 = _mm256_setzero_si256();
for( int k = 0; k < vecsize; k += 32, wptr += 32 )
{
__m256i v = _mm256_load_si256((const __m256i*)(vec + k));
vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0);
}
__m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs0), vs0);
s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1));
int temp = _mm_extract_epi32(_mm256_castsi256_si128(s0), 0);
dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]);
}
_mm256_zeroupper();
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@ -0,0 +1,595 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <float.h>
#include <algorithm>
#include <numeric>
using std::max;
using std::min;
namespace cv
{
namespace dnn
{
class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8
{
public:
PoolingLayerInt8Impl(const LayerParams& params)
{
computeMaxIdx = false;
globalPooling = false;
isGlobalPooling = std::vector<bool>(3, false);
output_zp = params.get<int>("zeropoints");
input_zp = params.get<int>("input_zeropoint", 0);
multiplier = params.get<float>("multiplier", 1.f);
hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
shapesInitialized = !hasDynamicShapes;
if (params.has("pool") || params.has("kernel_size") ||
params.has("kernel_w") || params.has("kernel_h"))
{
String pool = toLowerCase(params.get<String>("pool", "max"));
if (pool == "max")
type = MAX;
else if (pool == "ave")
type = AVE;
else if (pool == "sum")
type = SUM;
else
CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
getPoolingKernelParams(params, kernel_size, isGlobalPooling, pads_begin, pads_end, strides, padMode);
globalPooling = isGlobalPooling[0] || isGlobalPooling[1] || isGlobalPooling[2];
}
else
CV_Error(Error::StsBadArg, "Cannot determine pooling type");
setParamsFrom(params);
ceilMode = params.get<bool>("ceil_mode", true);
spatialScale = params.get<float>("spatial_scale", 1);
avePoolPaddedArea = params.get<bool>("ave_pool_padded_area", true);
}
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
CV_Assert(!inputs.empty());
CV_Assert(outputs.size() == 1);
std::vector<int> inp;
std::vector<int> out;
for (int i = 2; i < inputs[0].dims; i++) {
inp.push_back(inputs[0].size[i]);
out.push_back(outputs[0].size[i]);
}
if (globalPooling) {
std::vector<size_t> finalKernel;
for (int i = 0; i < inp.size(); i++) {
int idx = isGlobalPooling.size() - inp.size() + i;
finalKernel.push_back(isGlobalPooling[idx] ? inp[i] : kernel_size[idx]);
}
kernel_size = finalKernel;
}
getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end);
if (inputs[0].dims == 3)
{
// Pool1D
kernel_size.assign(1, kernel_size[0]);
strides.assign(1, strides[0]);
pads_begin.assign(1, pads_begin[0]);
pads_end.assign(1, pads_end[0]);
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
if (backendId == DNN_BACKEND_OPENCV)
{
if (kernel_size.size() == 3)
return preferableTarget == DNN_TARGET_CPU;
if (kernel_size.size() <= 2)
return true;
else
return false;
}
return false;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
if (!activ_int8.empty())
{
return activ_int8->blobs.empty();
}
return false;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
switch (type)
{
case MAX:
{
CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
maxPooling(inputs[0], outputs[0]);
break;
}
case AVE: case SUM:
CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
avePooling(inputs[0], outputs[0]);
break;
default:
CV_Error(Error::StsNotImplemented, "Not implemented");
break;
}
}
class PoolingInvoker : public ParallelLoopBody
{
public:
const Mat* src, *rois;
Mat *dst;
int pad_l, pad_t, pad_r, pad_b;
bool avePoolPaddedArea;
int nstripes, inpZp, outZp;
std::vector<int> ofsbuf;
int poolingType;
float multiplier;
float spatialScale;
std::vector<size_t> pads_begin, pads_end;
std::vector<size_t> kernel_size;
std::vector<size_t> strides;
PoolingInvoker() : src(0), rois(0), dst(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0),
avePoolPaddedArea(false), nstripes(0), inpZp(0), outZp(0),
poolingType(MAX), multiplier(1), spatialScale(0){}
static void run(const Mat& src, const Mat& rois, Mat& dst,
std::vector<size_t> kernel_size, std::vector<size_t> strides,
std::vector<size_t> pads_begin, std::vector<size_t> pads_end,
bool avePoolPaddedArea, int poolingType, float spatialScale,
float multiplier, int inpZp, int outZp, int nstripes)
{
CV_Assert_N(
src.isContinuous(), dst.isContinuous(),
src.type() == CV_8S, src.type() == dst.type(),
src.dims == 3 || src.dims == 4 || src.dims == 5, dst.dims == 3 || dst.dims == 4 || dst.dims == 5,
src.size[0] == dst.size[0], src.size[1] == dst.size[1], rois.empty());
PoolingInvoker p;
bool isPool1D = src.dims == 3;
bool isPool3D = src.dims == 5;
p.src = &src;
p.rois = &rois;
p.dst = &dst;
p.kernel_size = kernel_size;
p.strides = strides;
p.pads_begin = pads_begin;
p.pads_end = pads_end;
p.pad_l = pads_begin.back();
p.pad_t = isPool1D ? 0 : pads_begin[pads_begin.size() - 2];
p.pad_r = pads_end.back();
p.pad_b = isPool1D ? 0 : pads_end[pads_end.size() - 2];
p.avePoolPaddedArea = avePoolPaddedArea;
p.nstripes = nstripes;
p.inpZp = inpZp;
p.outZp = outZp;
p.poolingType = poolingType;
p.spatialScale = spatialScale;
p.multiplier = multiplier;
int height = isPool1D ? 1 : src.size[src.dims - 2];
int width = src.size[src.dims - 1];
int kernel_d = isPool3D ? kernel_size[0] : 1;
int kernel_h = isPool1D ? 1 : kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();
p.ofsbuf.resize(kernel_d * kernel_h * kernel_w);
for (int i = 0; i < kernel_d; ++i) {
for (int j = 0; j < kernel_h; ++j) {
for (int k = 0; k < kernel_w; ++k) {
p.ofsbuf[i * kernel_h * kernel_w + j * kernel_w + k] = width * height * i + width * j + k;
}
}
}
parallel_for_(Range(0, nstripes), p, nstripes);
}
void operator()(const Range& r) const CV_OVERRIDE
{
int channels = dst->size[1];
bool isPool3D = src->dims == 5;
bool isPool2D = src->dims == 4;
bool isPool1D = src->dims == 3;
int depth = isPool3D? dst->size[2] : 1;
int height = isPool1D? 1 : dst->size[dst->dims - 2];
int width = dst->size[dst->dims - 1];
int inp_depth = isPool3D? src->size[2] : 1;
int inp_height = isPool1D? 1 : src->size[src->dims - 2];
int inp_width = src->size[src->dims - 1];
size_t total = dst->total();
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, total);
int kernel_d = isPool3D? kernel_size[0] : 1;
int kernel_h = isPool1D? 1 : kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();
int stride_d = isPool3D? strides[0] : 0;
int stride_h = isPool1D? 1 :strides[strides.size() - 2];
int stride_w = strides.back();
#if CV_SIMD128
const int* ofsptr = (const int*)&ofsbuf[0];
if (poolingType == MAX && !ofsptr)
CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode");
#endif
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
{
size_t ofs = ofs0;
int x0 = (int)(ofs % width);
ofs /= width;
int y0 = (int)(ofs % height);
ofs /= height;
int d0 = (int)(ofs % depth);
ofs /= depth;
int c = (int)(ofs % channels);
int n = (int)(ofs / channels);
int ystart, yend;
int dstart = 0, dend = 1;
const int8_t *srcData = 0;
int pad_d_begin = (pads_begin.size() == 3) ? pads_begin[0] : 0;
dstart = d0 * stride_d - pad_d_begin;
dend = min(dstart + kernel_d, (int)(inp_depth + pads_end[0]));
ystart = y0 * stride_h - pad_t;
yend = min(ystart + kernel_h, inp_height + pad_b);
srcData = src->ptr<int8_t>(n, c);
int ddelta = dend - dstart;
dstart = max(dstart, 0);
dend = min(dend, inp_depth);
int ydelta = yend - ystart;
ystart = max(ystart, 0);
yend = min(yend, inp_height);
int8_t *dstData = &dst->ptr<int8_t>(n, c, d0)[y0 * width];
int delta = std::min((int)(stripeEnd - ofs0), width - x0);
ofs0 += delta;
int x1 = x0 + delta;
if( poolingType == MAX )
for( ; x0 < x1; x0++ )
{
int xstart = x0 * stride_w - pad_l;
int xend = min(xstart + kernel_w, inp_width);
xstart = max(xstart, 0);
if (xstart >= xend || ystart >= yend)
{
dstData[x0] = (int8_t)outZp;
continue;
}
#if CV_SIMD128
if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width )
{
v_int8x16 max_val0 = v_setall_s8(-128);
if( yend - ystart == kernel_h )
{
const int8_t* srcData1 = srcData + ystart*inp_width + xstart;
if( stride_w == 1 )
for (int k = 0; k < kernel_w*kernel_h; k++)
{
int index = ofsptr[k];
v_int8x16 v0 = v_load(srcData1 + index);
max_val0 = v_max(max_val0, v0);
}
else if( stride_w == 2 )
for (int k = 0; k < kernel_w*kernel_h; k++)
{
int index = ofsptr[k];
v_int8x16 v0, dummy;
v_load_deinterleave(srcData1 + index, v0, dummy);
max_val0 = v_max(max_val0, v0);
}
else
for (int k = 0; k < kernel_w*kernel_h; k++)
{
int index = ofsptr[k];
v_int8x16 v0(srcData1[index], srcData1[index + stride_w],
srcData1[index + stride_w*2], srcData1[index + stride_w*3],
srcData1[index + stride_w*4], srcData1[index + stride_w*5],
srcData1[index + stride_w*6], srcData1[index + stride_w*7],
srcData1[index + stride_w*8], srcData1[index + stride_w*9],
srcData1[index + stride_w*10], srcData1[index + stride_w*11],
srcData1[index + stride_w*12], srcData1[index + stride_w*13],
srcData1[index + stride_w*14], srcData1[index + stride_w*15]);
max_val0 = v_max(max_val0, v0);
}
}
else
{
for (int y = ystart; y < yend; ++y)
{
for (int x = xstart; x < xend; ++x)
{
const int index = y * inp_width + x;
v_int8x16 v0(srcData[index], srcData[index + stride_w],
srcData[index + stride_w*2], srcData[index + stride_w*3],
srcData[index + stride_w*4], srcData[index + stride_w*5],
srcData[index + stride_w*6], srcData[index + stride_w*7],
srcData[index + stride_w*8], srcData[index + stride_w*9],
srcData[index + stride_w*10], srcData[index + stride_w*11],
srcData[index + stride_w*12], srcData[index + stride_w*13],
srcData[index + stride_w*14], srcData[index + stride_w*15]);
max_val0 = v_max(max_val0, v0);
}
}
}
v_store(dstData + x0, max_val0);
x0 += 15;
}
else
#else
CV_UNUSED(isPool2D);
#endif
if( isPool1D )
{
const int8_t* first = srcData + xstart;
const int8_t* last = srcData + xend;
const int8_t* max_elem = std::max_element(first, last);
if (max_elem != last)
dstData[x0] = *max_elem;
}
else
{
int8_t max_val = -128;
for (int d = dstart; d < dend; ++d) {
for (int y = ystart; y < yend; ++y) {
for (int x = xstart; x < xend; ++x) {
const int index = d * inp_width * inp_height + y * inp_width + x;
int8_t val = srcData[index];
max_val = std::max(max_val, val);
}
}
}
dstData[x0] = max_val;
}
}
else if (poolingType == AVE || poolingType == SUM)
{
for( ; x0 < x1; ++x0)
{
int xstart = x0 * stride_w - pad_l;
int xend = min(xstart + kernel_w, inp_width + pad_r);
int xdelta = xend - xstart;
xstart = max(xstart, 0);
xend = min(xend, inp_width);
int real_kernel_area = (dend - dstart) * (yend - ystart) * (xend - xstart);
int padded_kernel_area = xdelta * ydelta * ddelta;
int kernel_area = avePoolPaddedArea ? padded_kernel_area : real_kernel_area;
int bias = (avePoolPaddedArea ? (padded_kernel_area - real_kernel_area) * inpZp : 0)
- (inpZp * kernel_area);
float inv_kernel_area = poolingType == AVE ? multiplier / kernel_area : multiplier;
#if CV_SIMD128
if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width )
{
v_int32x4 sum_val0 = v_setall_s32(bias), sum_val1 = v_setall_s32(bias),
sum_val2 = v_setall_s32(bias), sum_val3 = v_setall_s32(bias),
voutzp = v_setall_s32(outZp);
v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
for (int y = ystart; y < yend; ++y)
{
for (int x = xstart; x < xend; ++x)
{
const int index = y * inp_width + x;
v_int32x4 v0((int)srcData[index], (int)srcData[index + stride_w],
(int)srcData[index + stride_w*2], (int)srcData[index + stride_w*3]);
v_int32x4 v1((int)srcData[index + stride_w*4], (int)srcData[index + stride_w*5],
(int)srcData[index + stride_w*6], (int)srcData[index + stride_w*7]);
v_int32x4 v2((int)srcData[index + stride_w*8], (int)srcData[index + stride_w*9],
(int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]);
v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13],
(int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]);
sum_val0 += v0;
sum_val1 += v1;
sum_val2 += v2;
sum_val3 += v3;
}
}
sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp;
sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp;
sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp;
sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp;
v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3)));
x0 += 15;
}
else
#endif
if( isPool1D )
{
const int8_t* first = srcData + xstart;
const int8_t* last = srcData + xend;
int sum_val = bias + std::accumulate(first, last, 0);
dstData[x0] = saturate_cast<int8_t>(outZp + std::round(sum_val*inv_kernel_area));
}
else
{
int sum_val = bias;
for (int d = dstart; d < dend; ++d) {
for (int y = ystart; y < yend; ++y) {
for (int x = xstart; x < xend; ++x) {
const int index = d * inp_width * inp_height + y * inp_width + x;
int8_t val = srcData[index];
sum_val += (int)val;
}
}
}
dstData[x0] = saturate_cast<int8_t>(outZp + std::round(sum_val*inv_kernel_area));
}
}
}
}
}
};
void maxPooling(Mat &src, Mat &dst)
{
const int nstripes = getNumThreads();
Mat rois;
PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type,
spatialScale, multiplier, input_zp, output_zp, nstripes);
}
void avePooling(Mat &src, Mat &dst)
{
const int nstripes = getNumThreads();
Mat rois;
PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type,
spatialScale, multiplier, input_zp, output_zp, nstripes);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() != 0);
bool isPool1D = inputs[0].size() == 3;
std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
std::vector<int> outShape(inputs[0].begin(), inputs[0].begin() + 2);
std::vector<size_t> local_kernel;
if (globalPooling) {
for (int i = 0; i < inpShape.size(); i++) {
int idx = isGlobalPooling.size() - inpShape.size() + i;
local_kernel.push_back(isGlobalPooling[idx] ? inpShape[i] : kernel_size[idx]);
}
} else {
local_kernel = kernel_size;
}
if (hasDynamicShapes && !shapesInitialized)
{
//Just copy input shapes for width and height to prevent errors on loading stage
for (int i = 0; i < inpShape.size(); i++)
outShape.push_back(inpShape[i]);
}
else if (padMode.empty())
{
int addedDims = isPool1D? inpShape.size() : local_kernel.size();
for (int i = 0; i < addedDims; i++) {
float dst = (float) (inpShape[i] + pads_begin[i] + pads_end[i] - local_kernel[i]) / strides[i];
outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst)));
}
// If we have padding, ensure that the last pooling starts strictly
// inside the image (instead of at the padding); otherwise clip the last.
for (int i = 0; i < addedDims; i++) {
if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) {
--outShape[2 + i];
CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]);
}
}
}
else {
getConvPoolOutParams(inpShape, local_kernel, strides, padMode,
std::vector<size_t>(local_kernel.size(), 1), outShape);
}
outputs.assign(1, outShape);
return false;
}
bool updateMemoryShapes(const std::vector<MatShape> &inputs) CV_OVERRIDE
{
int dims = inputs[0].size();
CV_Assert(inputs[0][dims - 1] > 0 && inputs[0][dims - 2] > 0);
shapesInitialized = true;
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(inputs); // suppress unused variable warning
long flops = 0;
bool isPool1D = inputs[0].size() == 3;
size_t karea = std::accumulate(kernel_size.begin(), isPool1D? kernel_size.begin() + 1 : kernel_size.end(),
1, std::multiplies<size_t>());
for(int i = 0; i < outputs.size(); i++)
{
if (type == MAX)
{
if (i%2 == 0)
flops += total(outputs[i])*karea;
}
else
{
flops += total(outputs[i])*(karea + 1);
}
}
return flops;
}
private:
enum Type
{
MAX,
AVE,
STOCHASTIC,
SUM,
ROI, // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
PSROI // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
};
bool hasDynamicShapes;
bool shapesInitialized;
float multiplier;
};
Ptr<PoolingLayerInt8> PoolingLayerInt8::create(const LayerParams& params)
{
return Ptr<PoolingLayerInt8>(new PoolingLayerInt8Impl(params));
}
}
}

@ -0,0 +1,157 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
namespace cv
{
namespace dnn
{
class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
{
public:
QuantizeLayerImpl(const LayerParams& params)
{
scale = params.get<float>("scales", 1.0f);
zeropoint = params.get<int>("zeropoints", 0);
setParamsFrom(params);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return false;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs, outputs;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
if (inputs_.depth() == CV_16S)
{
UMat inputFp32(shape(inputs[0]), CV_32F);
convertFp16(inputs[0], inputFp32);
inputFp32.copyTo(inputs[0]);
}
inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint);
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint);
}
};
class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
{
public:
DequantizeLayerImpl(const LayerParams& params)
{
scale = params.get<float>("scales", 1.0f);
zeropoint = params.get<int>("zeropoints", 0);
setParamsFrom(params);
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
CV_Assert(inputs.size() == 1);
Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
return false;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
}
#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
std::vector<UMat> inputs, outputs;
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);
UMat outputFp32(shape(outputs[0]), CV_32F);
inputs[0].convertTo(outputFp32, CV_32F, scale, -(scale*zeropoint));
if (outputs_.depth() == CV_16S)
convertFp16(outputFp32, outputs[0]);
else
outputFp32.copyTo(outputs[0]);
return true;
}
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
inputs[0].convertTo(outputs[0], CV_32F, scale, -(scale*zeropoint));
}
};
Ptr<QuantizeLayer> QuantizeLayer::create(const LayerParams& params)
{
return Ptr<QuantizeLayer>(new QuantizeLayerImpl(params));
}
Ptr<DequantizeLayer> DequantizeLayer::create(const LayerParams& params)
{
return Ptr<DequantizeLayer>(new DequantizeLayerImpl(params));
}
}
}

@ -0,0 +1,211 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn/shape_utils.hpp>
namespace cv
{
namespace dnn
{
class ScaleLayerInt8Impl CV_FINAL : public ScaleLayerInt8
{
public:
Mat weights, bias;
ScaleLayerInt8Impl(const LayerParams& params)
{
setParamsFrom(params);
hasBias = params.get<bool>("bias_term", false);
axis = params.get<int>("axis", 1);
hasWeights = false;
output_sc = params.get<float>("scales");
output_zp = params.get<int>("zeropoints");
DictValue inpSc = params.get("input_scales");
DictValue inpZp = params.get("input_zeropoints");
for (int i = 0; i < inpSc.size(); i++)
{
inp_sc.push_back(inpSc.get<float>(i));
inp_zp.push_back(inpZp.get<int>(i));
}
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
outputs.assign(1, inputs[0]);
return true;
}
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
{
std::vector<Mat> inputs;
inputs_arr.getMatVector(inputs);
hasWeights = blobs.size() == 2 || (blobs.size() <= 1 && !hasBias);
CV_Assert((inputs.size() == 2 && blobs.empty()) || blobs.size() == (int)hasWeights + (int)hasBias);
if (!blobs.empty())
{
Mat w = hasWeights ? blobs[0] : Mat::ones(blobs[0].size(), CV_32F);
Mat b = hasBias ? blobs.back() : Mat::zeros(blobs.back().size(), CV_32F);
w = w.reshape(1, 1);
b = b.reshape(1, 1);
w.convertTo(weights, CV_32F, inp_sc[0]/output_sc);
addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F);
}
else
{
// initialized during forward()
weights = Mat(); bias = Mat();
}
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
{
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
if (!activ_int8.empty())
{
return activ_int8->blobs.empty();
}
return false;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
Mat &inpBlob = inputs[0];
Mat &outBlob = outputs[0];
if (blobs.empty())
{
CV_Assert(inp_sc.size() == 2 && inp_zp.size() == 2);
Mat inp_dequantized, w, b;
inputs[1].reshape(1, 1).convertTo(inp_dequantized, CV_32F, inp_sc[1], -(inp_sc[1]*inp_zp[1]));
w = hasWeights ? inp_dequantized : Mat::ones(inp_dequantized.size(), CV_32F);
b = hasBias ? inp_dequantized : Mat::zeros(inp_dequantized.size(), CV_32F);
w.convertTo(weights, CV_32F, inp_sc[0]/output_sc);
addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F);
}
MatShape inpShape = shape(inpBlob);
const int numWeights = weights.total();
CV_Assert(numWeights != 0);
CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");
int endAxis;
for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis)
{
if (total(inpShape, axis, endAxis) == numWeights)
break;
}
CV_Assert(total(inpShape, axis, endAxis) == numWeights);
CV_CheckTypeEQ(inpBlob.type(), CV_8SC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_8SC1, "");
int numSlices = total(inpShape, 0, axis);
int8_t* inpData = (int8_t*)inpBlob.data;
int8_t* outData = (int8_t*)outBlob.data;
if (endAxis != inpBlob.dims)
{
float* weightsData = (float*)weights.data;
float* biasesData = (float*)bias.data;
int spatialSize = total(inpShape, endAxis); // spatialSize != 1
for (int i = 0; i < numSlices; ++i)
{
for (int j = 0; j < numWeights; ++j)
{
float w = weightsData[j];
float b = biasesData[j];
Mat inpSlice(1, spatialSize, CV_8S, inpData);
Mat outSlice(1, spatialSize, CV_8S, outData);
inpSlice.convertTo(outSlice, CV_8S, w, b);
inpData += spatialSize;
outData += spatialSize;
}
}
}
else
{
for (int i = 0; i < numSlices; ++i)
{
Mat inpSlice(1, numWeights, CV_8S, inpData);
Mat outSlice(1, numWeights, CV_8S, outData);
multiply(inpSlice, weights, outSlice, 1.0, CV_8S);
add(outSlice, bias, outSlice, Mat(), CV_8S);
inpData += numWeights;
outData += numWeights;
}
}
}
void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
{
scale = (hasWeights && !blobs.empty()) ? blobs[0] : Mat();
shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat();
}
void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE
{
scale = output_sc;
zeropoint = output_zp;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
long flops = 0;
for(int i = 0; i < inputs.size(); i++)
{
flops += 2*total(inputs[i]);
}
return flops;
}
private:
bool hasWeights;
std::vector<float> inp_sc;
std::vector<int> inp_zp;
};
Ptr<ScaleLayerInt8> ScaleLayerInt8::create(const LayerParams& params)
{
return Ptr<ScaleLayerInt8>(new ScaleLayerInt8Impl(params));
}
Ptr<Layer> ShiftLayerInt8::create(const LayerParams& params)
{
LayerParams scaleParams = params;
scaleParams.type = "ScaleInt8";
scaleParams.set("bias_term", true);
scaleParams.set("axis", 0);
return Ptr<ScaleLayerInt8>(new ScaleLayerInt8Impl(scaleParams));
}
} // namespace dnn
} // namespace cv

@ -0,0 +1,176 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "layers_common.hpp"
#include <algorithm>
#include <stdlib.h>
namespace cv
{
namespace dnn
{
class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8
{
public:
SoftMaxLayerInt8Impl(const LayerParams& params)
{
axisRaw = params.get<int>("axis", 1);
logSoftMax = params.get<bool>("log_softmax", false);
output_sc = params.get<float>("scales");
output_zp = params.get<int>("zeropoints");
setParamsFrom(params);
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
std::vector<MatShape> &internals) const CV_OVERRIDE
{
bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
MatShape shape = inputs[0];
int cAxis = normalize_axis(axisRaw, shape.size());
shape[cAxis] = 1;
internals.assign(1, shape);
return inplace;
}
virtual bool supportBackend(int backendId) CV_OVERRIDE
{
return backendId == DNN_BACKEND_OPENCV;
}
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
{
Ptr<DequantizeLayer> dequantize_layer = top.dynamicCast<DequantizeLayer>();
return !dequantize_layer.empty() && preferableTarget != DNN_TARGET_OPENCL_FP16;
}
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
std::vector<Mat> inputs, outputs, internals;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
internals_arr.getMatVector(internals);
const Mat &src = inputs[0];
Mat &dst = outputs[0];
int axis = normalize_axis(axisRaw, src.dims);
size_t outerSize = src.total(0, axis), channels = src.size[axis],
innerSize = src.total(axis + 1);
CV_Assert(src.type() == CV_8S && (dst.type() == CV_8S || dst.type() == CV_32F));
CV_Assert(src.isContinuous() && dst.isContinuous());
size_t outerStep = src.total(axis);
size_t cnStep = src.total(axis + 1);
const int8_t *srcPtr = src.ptr<int8_t>();
const float *expPtr = blobs[0].ptr<float>();
if (dst.type() == CV_32F)
{
float *dstPtr = dst.ptr<float>();
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
std::vector<float> expSum(innerSize, 0.f);
// sum exp along axis
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
expSum[i] += expPtr[srcPtr[offset + i] + 128];
}
// divide by computed sum
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = expPtr[srcPtr[offset + i] + 128]/expSum[i];
}
if (logSoftMax)
{
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = log(dstPtr[offset + i]);
}
}
}
}
else
{
const float inv_scale = 1.f/output_sc;
int8_t *dstPtr = dst.ptr<int8_t>();
for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
{
size_t srcOffset = outerDim * outerStep;
std::vector<float> expSum(innerSize, 0.f);
// sum exp along axis
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
expSum[i] += expPtr[srcPtr[offset + i] + 128];
}
// divide by computed sum and quantize to int8
if (logSoftMax)
{
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = saturate_cast<int8_t>(output_zp + std::round(inv_scale*log(expPtr[srcPtr[offset + i] + 128]/expSum[i])));
}
}
else
{
for (size_t cnDim = 0; cnDim < channels; cnDim++)
{
const int offset = srcOffset + cnDim * cnStep;
for (size_t i = 0; i < innerSize; i++)
dstPtr[offset + i] = saturate_cast<int8_t>(output_zp + std::round(inv_scale*(expPtr[srcPtr[offset + i] + 128]/expSum[i])));
}
}
}
}
}
int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
CV_UNUSED(outputs); // suppress unused variable warning
int64 flops = 0;
for (int i = 0; i < inputs.size(); i++)
{
flops += 4*total(inputs[i]);
}
return flops;
}
int axisRaw;
};
Ptr<SoftmaxLayerInt8> SoftmaxLayerInt8::create(const LayerParams& params)
{
return Ptr<SoftmaxLayerInt8>(new SoftMaxLayerInt8Impl(params));
}
}
}

@ -409,6 +409,18 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
params.set("input_scale", scales[0][0]);
params.set("input_zeropoint", zeropoints[0][0]);
params.blobs.clear();
params.blobs.push_back(origin_weights);
params.blobs.push_back(origin_bias);
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -166,6 +166,11 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
};
Ptr<Layer> BlankLayer::create(const LayerParams& params)

@ -70,6 +70,7 @@ public:
setParamsFrom(params);
axis = params.get<int>("axis", 1);
padding = params.get<bool>("padding", false);
paddingValue = params.get<int>("padding_value", 0);
}
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@ -119,13 +120,14 @@ public:
(backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding);
}
template <class T>
class ChannelConcatInvoker : public ParallelLoopBody
{
public:
std::vector<Mat>* inputs;
Mat* output;
int nstripes;
std::vector<const float*> chptrs;
std::vector<const T*> chptrs;
static void run(std::vector<Mat>& inputs, Mat& output, int nstripes)
{
@ -139,14 +141,14 @@ public:
for( i = 0; i < ninputs; i++ )
{
Mat& inp = inputs[i];
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
inp.dims == 4 && inp.size[0] == output.size[0] &&
inp.size[2] == output.size[2] &&
inp.size[3] == output.size[3] );
nchannels += inp.size[1];
}
CV_Assert( nchannels == output.size[1] );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );
CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );
cc.chptrs.resize(nchannels*batchsz);
@ -157,7 +159,7 @@ public:
for( int j = 0; j < batchsz; j++ )
for( int k = 0; k < inp.size[1]; k++ )
{
const float* ptr = inp.ptr<float>(j, k);
const T* ptr = inp.ptr<T>(j, k);
cc.chptrs[ofs + j*nchannels + k] = ptr;
}
ofs += inp.size[1];
@ -176,8 +178,8 @@ public:
size_t stripeSize = (total + nstripes - 1)/nstripes;
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(total, r.end*stripeSize);
const float** ptrs = (const float**)&chptrs[0];
float* outptr = output->ptr<float>();
const T** ptrs = (const T**)&chptrs[0];
T* outptr = output->ptr<T>();
size_t blockSize0 = 1 << 16;
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
@ -248,7 +250,8 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
inputs_arr.depth() != CV_8S,
forward_ocl(inputs_arr, outputs_arr, internals_arr))
std::vector<Mat> inputs, outputs;
@ -259,12 +262,15 @@ public:
Mat& outMat = outputs[0];
if (padding)
outMat.setTo(0);
outMat.setTo(paddingValue);
if( cAxis == 1 && outMat.dims == 4 && !padding)
{
int nstripes = getNumThreads();
ChannelConcatInvoker::run(inputs, outMat, nstripes);
if (outMat.type() == CV_8S)
ChannelConcatInvoker<int8_t>::run(inputs, outMat, nstripes);
else
ChannelConcatInvoker<float>::run(inputs, outMat, nstripes);
}
else
{
@ -394,6 +400,14 @@ public:
return Ptr<BackendNode>(new InfEngineNgraphNode(concat));
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (padding)
params.set("padding_value", zeropoints[1][0]);
return true;
}
};
Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)

@ -112,6 +112,15 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
Mat quantizedBlob;
blobs[0].convertTo(quantizedBlob, CV_8S, 1.f/scales[1][0], zeropoints[1][0]);
params.blobs.clear();
params.blobs.push_back(quantizedBlob);
return true;
}
};
Ptr<Layer> ConstLayer::create(const LayerParams& params)

@ -2083,6 +2083,48 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
// References - https://arxiv.org/pdf/1712.05877.pdf
// Quantized convolution with variable weights is not supported.
if (blobs.empty())
return false;
float inputScale = scales[0][0], outputScale = scales[1][0];
int inputZp = zeropoints[0][0];
params.set("input_zeropoint", inputZp);
Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
Mat biasQuantized(1, numOutput, CV_32S);
Mat outputMultiplier(1, numOutput, CV_32F);
double realMin, realMax, weightsScale;
for( int i = 0; i < numOutput; i++ )
{
// Quantize weights
cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax);
realMin = std::min(realMin, 0.0);
realMax = std::max(realMax, 0.0);
weightsScale = (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
// Quantize biases
float biasScale = inputScale * weightsScale;
biasQuantized.at<int>(i) = (int)std::round(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
// Store multiplier
outputMultiplier.at<float>(i) = biasScale / outputScale;
}
params.blobs.clear();
params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
params.blobs.push_back(biasQuantized);
params.blobs.push_back(outputMultiplier);
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -255,6 +255,12 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return func.tryQuantize(scales, zeropoints, params);
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{
@ -288,6 +294,8 @@ struct BaseFunctor
bool tryFuse(Ptr<dnn::Layer>&) { return false; }
void getScaleShift(Mat&, Mat&) const {}
bool tryQuantize(const std::vector<std::vector<float>>&, const std::vector<std::vector<int>>&, LayerParams&) { return false; }
};
struct ReLUFunctor : public BaseFunctor
@ -436,6 +444,29 @@ struct ReLUFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
if (slope != 0.f)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = x >= 0.f ? x : slope*x;
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
}
return true;
}
int64 getFLOPSPerElement() const { return 1; }
};
@ -559,6 +590,12 @@ struct ReLU6Functor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
return true;
}
int64 getFLOPSPerElement() const { return 2; }
};
@ -651,6 +688,26 @@ struct TanHFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = tanh(x);
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPSPerElement() const { return 1; }
};
@ -743,6 +800,26 @@ struct SwishFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = x / (1.0f + exp(-x));
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPSPerElement() const { return 3; }
};
@ -848,6 +925,28 @@ struct MishFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float eX = exp(x);
float n = (eX + 2) * eX;
float y = (x * n) / (n + 2);
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPSPerElement() const { return 3; }
};
@ -940,6 +1039,26 @@ struct SigmoidFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = 1.f/(1.f + exp(-x));
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPSPerElement() const { return 3; }
};
@ -1032,6 +1151,26 @@ struct ELUFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = x >= 0.f ? x : exp(x) - 1;
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPSPerElement() const { return 2; }
};
@ -1130,6 +1269,26 @@ struct AbsValFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = abs(x);
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPSPerElement() const { return 1; }
};
@ -1223,6 +1382,26 @@ struct BNLLFunctor : public BaseFunctor
}
#endif // HAVE_VULKAN
bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
{
float inpScale = scales[0][0], outScale = scales[1][0];
int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
Mat lookUpTable(1, 256, CV_8S);
int8_t* table = lookUpTable.ptr<int8_t>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - inpZp);
float y = x > 0 ? x + log(1. + exp(-x)) : log(1. + exp(x));
int quantized = outZp + (int)std::round(y/outScale);
table[i+128] = saturate_cast<int8_t>(quantized);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPSPerElement() const { return 5; }
};

@ -864,6 +864,37 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (op == SUM)
{
std::vector<float> newCoeffs;
float offset = zeropoints[1][0];
float out_sc = scales[1][0];
for (int i = 0; i < scales[0].size(); i++)
{
float coeff = coeffs.empty() ? 1.f : coeffs[i];
float newcoeff = (scales[0][i] * coeff) / out_sc;
newCoeffs.push_back(newcoeff);
offset -= (newcoeff * zeropoints[0][i]);
}
params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
params.set("offset", offset);
return true;
}
else if (op == PROD)
{
std::vector<float> newCoeffs = scales[0];
newCoeffs[0] /= scales[1][0];
params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
params.set("offset", zeropoints[1][0]);
params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
return true;
}
return op == MAX;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -227,6 +227,11 @@ virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inp
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
int _startAxis;
int _endAxis;

@ -618,6 +618,45 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (blobs.empty())
return false;
int numOutput = blobs[0].size[0];
float inputScale = scales[0][0], outputScale = scales[1][0];
int inputZp = zeropoints[0][0];
Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
Mat biasQuantized(1, numOutput, CV_32S);
Mat outputMultiplier(1, numOutput, CV_32F);
double realMin, realMax, weightsScale;
for( int i = 0; i < numOutput; i++ )
{
// Quantize weights
cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax);
realMin = std::min(realMin, 0.0);
realMax = std::max(realMax, 0.0);
weightsScale = (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
// Quantize biases
float biasScale = inputScale * weightsScale;
biasQuantized.at<int>(i) = (int)std::round(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
// Store multiplier
outputMultiplier.at<float>(i) = biasScale / outputScale;
}
params.blobs.clear();
params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
params.blobs.push_back(biasQuantized);
params.blobs.push_back(outputMultiplier);
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -134,6 +134,8 @@ public:
cv::convertFp16(paddingValue_fp32, paddingValue_fp16);
outputs[0].setTo(paddingValue_fp16[0]);
}
else if (inputs_arr.depth() == CV_8S)
outputs[0].setTo(saturate_cast<int8_t>(paddingValue));
else
outputs[0].setTo(paddingValue);
inputs[0].copyTo(outputs[0](dstRanges));
@ -264,6 +266,16 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
float outputScale = scales[1][0];
int outputZp = zeropoints[1][0];
float padValue = outputZp + std::round(params.get<float>("value", 0)/outputScale);
params.set("value", padValue);
return true;
}
private:
std::vector<std::pair<int, int> > paddings; // Pairs pad before, pad after.
std::vector<Range> dstRanges;

@ -194,6 +194,7 @@ public:
#endif
}
template <class T>
class PermuteInvoker : public ParallelLoopBody
{
public:
@ -229,7 +230,7 @@ public:
size_t stripeStart = r.start*stripeSize;
size_t stripeEnd = std::min(r.end*stripeSize, orows);
const size_t esz = sizeof(float);
const size_t esz = sizeof(T);
size_t ostep0 = out->step[0]/esz, ostep1 = out->step[1]/esz, ostep2 = out->step[2]/esz;
const size_t* ord = &order->at(0);
size_t istep0 = inp->step[ord[0]]/esz, istep1 = inp->step[ord[1]]/esz,
@ -241,13 +242,13 @@ public:
int i1 = (int)(val % n1);
int i0 = (int)(val / n1);
const float* inptr_orig = inp->ptr<float>();
float* outptr_orig = out->ptr<float>();
const T* inptr_orig = inp->ptr<T>();
T* outptr_orig = out->ptr<T>();
for( size_t ofs = stripeStart; ofs < stripeEnd; ofs++ )
{
const float* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2;
float* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2;
const T* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2;
T* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2;
for( int i3 = 0; i3 < n3; i3++ )
outptr[i3] = inptr[i3*istep3];
@ -321,7 +322,8 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
inputs_arr.depth() != CV_8S,
forward_ocl(inputs_arr, outputs_arr, internals_arr))
if (inputs_arr.depth() == CV_16S)
@ -365,24 +367,48 @@ public:
if( numAxes == 4 )
{
int nstripes = getNumThreads();
PermuteInvoker::run(inp, out, _order, nstripes);
if (inp.type() == CV_8S)
PermuteInvoker<int8_t>::run(inp, out, _order, nstripes);
else
PermuteInvoker<float>::run(inp, out, _order, nstripes);
}
else
{
const float *srcData = inp.ptr<float>();
float *dstData = out.ptr<float>();
if (inp.type() == CV_8S)
{
const int8_t *srcData = inp.ptr<int8_t>();
int8_t *dstData = out.ptr<int8_t>();
for (i = 0; i < count; ++i)
for (i = 0; i < count; ++i)
{
size_t oldPosition = 0;
size_t newPosition = i;
for (j = 0; j < numAxes; ++j)
{
oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
newPosition %= newStride[j];
}
dstData[i] = srcData[oldPosition];
}
}
else
{
size_t oldPosition = 0;
size_t newPosition = i;
const float *srcData = inp.ptr<float>();
float *dstData = out.ptr<float>();
for (j = 0; j < numAxes; ++j)
for (i = 0; i < count; ++i)
{
oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
newPosition %= newStride[j];
size_t oldPosition = 0;
size_t newPosition = i;
for (j = 0; j < numAxes; ++j)
{
oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
newPosition %= newStride[j];
}
dstData[i] = srcData[oldPosition];
}
dstData[i] = srcData[oldPosition];
}
}
}
@ -436,6 +462,11 @@ public:
}
#endif // HAVE_VULKAN
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
size_t _count;
std::vector<size_t> _order;

@ -1327,6 +1327,23 @@ public:
return true;
}
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
if (type == MAX && !computeMaxIdx)
{
return true;
}
else if (type == AVE || type == SUM)
{
float multiplier = scales[0][0] / scales[1][0];
params.set("multiplier", multiplier);
params.set("input_zeropoint", zeropoints[0][0]);
return true;
}
return false;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -231,6 +231,11 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE

@ -343,6 +343,11 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
private:
std::vector<MatShape> outShapes;

@ -344,6 +344,14 @@ public:
shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat();
}
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
return true;
}
virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -147,6 +147,12 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
return true;
}
private:
Ptr<PermuteLayer> permute;
std::vector<int> permuteInpShape, permuteOutShape;

@ -531,7 +531,12 @@ public:
{
std::vector<int> inpIdx(dimsNum, 0);
std::vector<int> outIdx(dimsNum, 0);
getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
if (inpMat.type() == CV_16S)
getSliceRecursive<int16_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
else if (inpMat.type() == CV_8S)
getSliceRecursive<int8_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
else
getSliceRecursive<float>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
}
}
}
@ -647,8 +652,20 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
const int numOutputs = scales[1].size();
for (int i = 0; i < numOutputs; i++)
{
if (scales[1][i] != scales[0][0])
return false;
}
return true;
}
private:
template <typename T>
void getSliceRecursive(const Mat &inpMat, std::vector<int> &inpIdx,
const std::vector<Range> &sliceRanges,
const std::vector<int> &sliceSteps, int dim, int dimsNum,
@ -658,8 +675,6 @@ private:
int end = sliceRanges[dim].end;
int step = !sliceSteps.empty() ? sliceSteps[dim] : 1;
const bool is32F = inpMat.depth() == CV_32F;
// TODO optimization is required (for 2D tail case at least)
for (int k = begin, j = 0; k < end; k += step, j++)
{
@ -667,14 +682,9 @@ private:
outIdx[dim] = j;
if (dim + 1 < dimsNum)
getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx);
getSliceRecursive<T>(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx);
else
{
if (is32F)
outputs.at<float>(outIdx.data()) = inpMat.at<float>(inpIdx.data());
else
outputs.at<short>(outIdx.data()) = inpMat.at<short>(inpIdx.data()); // 16F emulation
}
outputs.at<T>(outIdx.data()) = inpMat.at<T>(inpIdx.data());
}
}

@ -374,6 +374,22 @@ public:
}
#endif // HAVE_DNN_NGRAPH
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
float inpScale = scales[0][0];
Mat lookUpTable(1, 256, CV_32F);
float* table = lookUpTable.ptr<float>();
for (int i = -128; i < 128; i++)
{
float x = inpScale*(i - 127); // ensures exp(x) is always between (0, 1)
table[i+128] = std::exp(x);
}
params.blobs.clear();
params.blobs.push_back(lookUpTable);
return true;
}
int64 getFLOPS(const std::vector<MatShape> &inputs,
const std::vector<MatShape> &outputs) const CV_OVERRIDE
{

@ -117,6 +117,17 @@ public:
}
#endif
virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
{
const int numOutputs = scales[1].size();
for (int i = 0; i < numOutputs; i++)
{
if (scales[1][i] != scales[0][0])
return false;
}
return true;
}
};
Ptr<SplitLayer> SplitLayer::create(const LayerParams& params)

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save