Mirror of https://github.com/opencv/opencv.git
Merge pull request #20228 from SamFC10:int8
Commit f787c49b53
36 changed files with 6400 additions and 66 deletions
@@ -0,0 +1,178 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>

namespace cv
{
namespace dnn
{

class BatchNormLayerInt8Impl CV_FINAL : public BatchNormLayerInt8
{
public:
    Mat origin_weights, origin_bias;
    Mat weights_, bias_;
    mutable int dims;

    BatchNormLayerInt8Impl(const LayerParams& params)
        : dims(-1)
    {
        setParamsFrom(params);
        useGlobalStats = params.get<bool>("use_global_stats", true);
        input_sc = params.get<float>("input_scale");
        input_zp = params.get<int>("input_zeropoint");
        output_sc = params.get<float>("scales");
        output_zp = params.get<int>("zeropoints");

        CV_Assert(blobs.size() == 2);
        size_t n = blobs[0].total();
        CV_Assert(blobs[1].total() == n &&
                  blobs[0].isContinuous() && blobs[1].isContinuous() &&
                  blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);

        origin_weights = blobs[0];
        origin_bias = blobs[1];
    }

    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
    {
        origin_weights.convertTo(weights_, CV_32F, input_sc/output_sc);
        addWeighted(origin_bias, 1.0/output_sc, weights_, -input_zp, output_zp, bias_, CV_32F);
    }

    void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
    {
        scale = origin_weights;
        shift = origin_bias;
    }

    void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE
    {
        scale = output_sc;
        zeropoint = output_zp;
    }

    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
    {
        Mat w_, b_;
        top->getScaleShift(w_, b_);
        if (w_.empty() && b_.empty())
            return false;

        const int numChannels = weights_.total();
        const int numFusedWeights = w_.total();
        const int numFusedBias = b_.total();

        if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w_.empty()) ||
            (numFusedBias != numChannels && numFusedBias != 1 && !b_.empty()))
            return false;

        float new_sc;
        int new_zp;
        top->getScaleZeropoint(new_sc, new_zp);

        Mat w = numFusedWeights == 1 ? Mat(1, numChannels, CV_32F, Scalar(w_.at<float>(0))) :
                (w_.empty() ? Mat::ones(1, numChannels, CV_32F) : w_.reshape(1, 1));

        Mat b = numFusedBias == 1 ? Mat(1, numChannels, CV_32F, Scalar(b_.at<float>(0))) :
                (b_.empty() ? Mat::zeros(1, numChannels, CV_32F) : b_.reshape(1, 1));

        weights_ = Mat(); bias_ = Mat();
        multiply(origin_weights, w, weights_, input_sc/new_sc, CV_32F);
        multiply(origin_bias, w, bias_);
        add(bias_, b, bias_);
        addWeighted(bias_, 1.0/new_sc, weights_, -input_zp, new_zp, bias_, CV_32F);
        return true;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        dims = inputs[0].size();
        if (!useGlobalStats && inputs[0][0] != 1)
            CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        return true;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV;
    }

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
        if (!activ_int8.empty())
        {
            return activ_int8->blobs.empty();
        }
        return false;
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(blobs.size() == 2);
        CV_Assert(inputs.size() == 1);

        Mat &inpBlob = inputs[0];
        int planeSize = 1;
        for (size_t i = 2; i < inpBlob.dims; i++) {
            planeSize *= inpBlob.size[i];
        }

        for (size_t ii = 0; ii < outputs.size(); ii++)
        {
            Mat &outBlob = outputs[ii];

            for(int num = 0; num < outBlob.size[0]; num++)
            {
                for (int n = 0; n < outBlob.size[1]; n++)
                {
                    float w = weights_.at<float>(n);
                    float b = bias_.at<float>(n);
                    Mat inpBlobPlane(1, planeSize, CV_8S, inpBlob.ptr<int8_t>(num, n));
                    Mat outBlobPlane(1, planeSize, CV_8S, outBlob.ptr<int8_t>(num, n));
                    inpBlobPlane.convertTo(outBlobPlane, CV_8S, w, b);
                }
            }
        }
    }

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning

        int64 flops = 0;
        for(int i = 0; i < inputs.size(); i++)
        {
            flops += 3*total(inputs[i]);
        }
        return flops;
    }

private:
    bool useGlobalStats;
};

Ptr<BatchNormLayerInt8> BatchNormLayerInt8::create(const LayerParams& params)
{
    return Ptr<BatchNormLayerInt8>(new BatchNormLayerInt8Impl(params));
}

}  // namespace dnn
}  // namespace cv
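Editor's note (not part of the patch): finalize() above folds the float batch-norm scale/shift into one per-channel int8 affine transform, which forward() then applies plane by plane via convertTo(). A minimal scalar sketch of that arithmetic, using the same scale/zero-point names as the layer; the helper name is hypothetical.

// out_q = saturate(round(w*in_q + b)) with
//   w = gamma * input_sc / output_sc
//   b = beta / output_sc - w * input_zp + output_zp
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t batchNormInt8Scalar(int8_t in_q, float gamma, float beta,
                                         float input_sc, int input_zp,
                                         float output_sc, int output_zp)
{
    float w = gamma * input_sc / output_sc;                  // folded scale
    float b = beta / output_sc - w * input_zp + output_zp;   // folded shift (absorbs zero points)
    int out = (int)std::lround(w * in_q + b);                // round to nearest, like convertTo()
    return (int8_t)std::min(std::max(out, -128), 127);       // saturate to int8 range
}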
(File diff suppressed because it is too large.)
@@ -0,0 +1,190 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"
#include "layers_common.hpp"

#include <opencv2/dnn/shape_utils.hpp>
#include <iostream>

namespace cv
{
namespace dnn
{

class ActivationLayerInt8Impl CV_FINAL : public ActivationLayerInt8
{
public:
    ActivationLayerInt8Impl(const LayerParams &params)
    {
        setParamsFrom(params);
        activationLUT = !blobs.empty() ? blobs[0] : Mat();
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        return true;
    }

    class Activation : public cv::ParallelLoopBody
    {
    public:
        const Mat* src;
        const Mat* lut;
        Mat* dst;
        int nstripes;

        Activation() : src(0), lut(0), dst(0), nstripes(0){}

        static void run(const Mat& src, const Mat& lut, Mat& dst, int nstripes)
        {
            Activation p;

            p.src = &src;
            p.lut = &lut;
            p.dst = &dst;
            p.nstripes = nstripes;

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range &r) const CV_OVERRIDE
        {
            const int8_t* table = lut->ptr<int8_t>();
            int nsamples = 1, outCn = 1;
            size_t planeSize = 1;

            if (src->dims > 1)
            {
                nsamples = src->size[0];
                outCn = src->size[1];
            }
            else
                outCn = src->size[0];

            for (int i = 2; i < src->dims; ++i)
                planeSize *= src->size[i];

            size_t stripeSize = (planeSize + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, planeSize);
            int len = (int)(stripeEnd - stripeStart);

            for( int i = 0; i < nsamples; i++ )
            {
                const int8_t* srcptr = src->ptr<int8_t>(i) + stripeStart;
                int8_t* dstptr = dst->ptr<int8_t>(i) + stripeStart;
                for( int cn = 0; cn < outCn; cn++, srcptr += planeSize, dstptr += planeSize )
                {
                    int i = 0;
#if CV_SIMD128
                    for( ; i <= len - 16; i += 16 )
                    {
                        v_int8x16 out(table[srcptr[i] + 128], table[srcptr[i+1] + 128], table[srcptr[i+2] + 128], table[srcptr[i+3] + 128],
                                      table[srcptr[i+4] + 128], table[srcptr[i+5] + 128], table[srcptr[i+6] + 128], table[srcptr[i+7] + 128],
                                      table[srcptr[i+8] + 128], table[srcptr[i+9] + 128], table[srcptr[i+10] + 128], table[srcptr[i+11] + 128],
                                      table[srcptr[i+12] + 128], table[srcptr[i+13] + 128], table[srcptr[i+14] + 128], table[srcptr[i+15] + 128]);
                        v_store(dstptr + i, out);
                    }
#endif
                    for( ; i < len; i++ )
                    {
                        dstptr[i] = table[srcptr[i] + 128];
                    }
                }
            }
        }
    };

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        for (size_t i = 0; i < inputs.size(); i++)
        {
            const Mat &src = inputs[i];
            if (!activationLUT.empty())
            {
                const int nstripes = getNumThreads();
                Mat &dst = outputs[i];
                CV_Assert(src.size == dst.size && src.type() == dst.type() &&
                          src.isContinuous() && dst.isContinuous() && src.type() == CV_8S);

                Activation::run(src, activationLUT, dst, nstripes);
            }
            else
            {
                src.copyTo(outputs[i]);
            }
        }
    }

    void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
    {
        for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize )
        {
            int i = 0;
#if CV_SIMD128
            for( ; i <= len - 16; i += 16 )
            {
                v_int8x16 out(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128],
                              lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128],
                              lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128],
                              lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]);
                v_store(dst + i, out);
            }
#endif
            for( ; i < len; i++ )
                dst[i] = lut[src[i] + 128];
        }
    }

    void forwardSlice(const int* src, const int* lut, int* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
    {
        for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize )
        {
            int i = 0;
#if CV_SIMD128
            for( ; i <= len - 16; i += 16 )
            {
                v_int32x4 out0(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128]);
                v_int32x4 out1(lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128]);
                v_int32x4 out2(lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128]);
                v_int32x4 out3(lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]);

                v_store(dst + i, out0);
                v_store(dst + i + 4, out1);
                v_store(dst + i + 8, out2);
                v_store(dst + i + 12, out3);
            }
#endif
            for( ; i < len; i++ )
                dst[i] = lut[src[i] + 128];
        }
    }

    Mat activationLUT;
};

Ptr<ActivationLayerInt8> ActivationLayerInt8::create(const LayerParams& params)
{
    return Ptr<ActivationLayerInt8>(new ActivationLayerInt8Impl(params));
}

}
}
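Editor's note (not part of the patch): the layer above applies a 256-entry int8 lookup table, indexing it with the raw int8 value shifted by +128 into [0, 255]. The table itself is presumably filled by the quantization code elsewhere in this PR (not shown here); the sketch below only illustrates how such a LUT could be built for a ReLU-like activation, assuming known input/output scales and zero points. The function name is hypothetical.

#include <algorithm>
#include <cmath>
#include <cstdint>

static void buildReluLUT(int8_t lut[256], float inp_sc, int inp_zp,
                         float out_sc, int out_zp)
{
    for (int q = -128; q <= 127; q++)
    {
        float x = inp_sc * (q - inp_zp);                   // dequantize the int8 input value
        float y = std::max(x, 0.f);                        // the activation itself (ReLU here)
        int   r = out_zp + (int)std::lround(y / out_sc);   // requantize to the output scale
        lut[q + 128] = (int8_t)std::min(std::max(r, -128), 127);
    }
}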
@@ -0,0 +1,577 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "../precomp.hpp" |
||||
#include "layers_common.hpp" |
||||
#include <opencv2/dnn/shape_utils.hpp> |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace dnn |
||||
{ |
||||
|
||||
class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8 |
||||
{ |
||||
public: |
||||
enum EltwiseOp |
||||
{ |
||||
PROD = 0, |
||||
SUM = 1, |
||||
MAX = 2 |
||||
} op; |
||||
std::vector<float> coeffs; |
||||
std::vector<int> zeropoints; |
||||
|
||||
enum OutputChannelsMode |
||||
{ |
||||
ELTWISE_CHANNNELS_SAME = 0, //!< number of channels from inputs must be the same and equal to output's number of channels
|
||||
ELTWISE_CHANNNELS_INPUT_0, //!< number of channels from inputs may be different,
|
||||
//!< output's number of channels is equal to number of channels of first input
|
||||
//!< number of channels of other inputs should not be greater than number of channels of first input
|
||||
ELTWISE_CHANNNELS_INPUT_0_TRUNCATE, //!< number of channels from inputs may be different,
|
||||
//!< output's number of channels is equal to number of channels of first input
|
||||
//!< there is restriction on number of channels of other inputs
|
||||
//!< extra channels of other inputs is ignored
|
||||
ELTWISE_CHANNNELS_USE_MAX, //!< number of channels from inputs may be different,
|
||||
//!< output's number of channels is equal to maximal number of input channels
|
||||
//!< @note supported operation: `SUM`
|
||||
} channelsModeInput; |
||||
|
||||
|
||||
mutable OutputChannelsMode channelsMode; //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal)
|
||||
mutable /*size_t*/int outputChannels; |
||||
|
||||
EltwiseLayerInt8Impl(const LayerParams& params) |
||||
: outputChannels(0) |
||||
{ |
||||
setParamsFrom(params); |
||||
offset = params.get<float>("offset", 0.f); |
||||
hasVecInput = false; |
||||
op = SUM; |
||||
if (params.has("operation")) |
||||
{ |
||||
String operation = toLowerCase(params.get<String>("operation")); |
||||
if (operation == "prod") |
||||
op = PROD; |
||||
else if (operation == "sum") |
||||
op = SUM; |
||||
else if (operation == "max") |
||||
op = MAX; |
||||
else |
||||
CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\""); |
||||
} |
||||
|
||||
if (params.has("coeff")) |
||||
{ |
||||
DictValue paramCoeff = params.get("coeff"); |
||||
int i, n = paramCoeff.size(); |
||||
coeffs.resize(n); |
||||
for (i = 0; i < n; i++) |
||||
{ |
||||
coeffs[i] = paramCoeff.get<float>(i); |
||||
} |
||||
} |
||||
|
||||
if (params.has("input_zeropoints")) |
||||
{ |
||||
DictValue zp = params.get("input_zeropoints"); |
||||
int i, n = zp.size(); |
||||
zeropoints.resize(n); |
||||
for (i = 0; i < n; i++) |
||||
{ |
||||
zeropoints[i] = zp.get<int>(i); |
||||
} |
||||
} |
||||
|
||||
channelsModeInput = ELTWISE_CHANNNELS_SAME; |
||||
if (params.has("output_channels_mode")) |
||||
{ |
||||
String v = toLowerCase(params.get<String>("output_channels_mode")); |
||||
if (v == "same") |
||||
{ |
||||
channelsModeInput = ELTWISE_CHANNNELS_SAME; |
||||
} |
||||
else if (v == "input_0") |
||||
{ |
||||
channelsModeInput = ELTWISE_CHANNNELS_INPUT_0; |
||||
} |
||||
else if (v == "input_0_truncate") |
||||
{ |
||||
channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE; |
||||
} |
||||
else if (v == "max_input_channels") |
||||
{ |
||||
channelsModeInput = ELTWISE_CHANNNELS_USE_MAX; |
||||
if (op != SUM) |
||||
CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only"); |
||||
} |
||||
else |
||||
CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\""); |
||||
} |
||||
channelsMode = channelsModeInput; |
||||
|
||||
// TODO Must have checks for other unknown options
|
||||
} |
||||
|
||||
virtual bool supportBackend(int backendId) CV_OVERRIDE |
||||
{ |
||||
return backendId == DNN_BACKEND_OPENCV; |
||||
} |
||||
|
||||
bool getMemoryShapes(const std::vector<MatShape> &inputs, |
||||
const int requiredOutputs, |
||||
std::vector<MatShape> &outputs, |
||||
std::vector<MatShape> &internals) const CV_OVERRIDE |
||||
{ |
||||
CV_Assert(inputs.size() >= 2); |
||||
CV_Assert(inputs[0].size() >= 2); |
||||
CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size()); |
||||
CV_Assert(op == SUM || op == PROD || coeffs.size() == 0); |
||||
|
||||
int dims = inputs[0].size(); |
||||
// Number of channels in output shape is determined by the first input tensor.
|
||||
bool variableChannels = false; |
||||
int numChannels = inputs[0][1]; |
||||
for (size_t i = 1; i < inputs.size(); i++) |
||||
{ |
||||
CV_Assert(inputs[0][0] == inputs[i][0]); // batch sizes are equal
|
||||
|
||||
int input_channels = inputs[i][1]; |
||||
if (numChannels != input_channels) |
||||
variableChannels = true; |
||||
|
||||
if (channelsModeInput == ELTWISE_CHANNNELS_SAME) |
||||
{ |
||||
CV_Assert(numChannels == input_channels); |
||||
} |
||||
else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0) |
||||
{ |
||||
CV_Assert(numChannels >= input_channels); |
||||
} |
||||
else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE) |
||||
{ |
||||
// nothing to check
|
||||
} |
||||
else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX) |
||||
{ |
||||
numChannels = std::max(numChannels, input_channels); |
||||
} |
||||
else |
||||
{ |
||||
CV_Assert(0 && "Internal error"); |
||||
} |
||||
} |
||||
|
||||
channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME; |
||||
outputChannels = numChannels; |
||||
|
||||
outputs.assign(1, inputs[0]); |
||||
outputs[0][1] = numChannels; |
||||
|
||||
if (dims > 2) |
||||
{ |
||||
size_t vecIdx = 0; |
||||
bool isVecFound = false; |
||||
for (size_t i = 0; i < inputs.size(); i++) |
||||
{ |
||||
bool allOnes = isAllOnes(inputs[i], 2, dims); |
||||
if (!allOnes && !isVecFound) |
||||
{ |
||||
vecIdx = i; |
||||
isVecFound = true; |
||||
} |
||||
|
||||
if (!allOnes && i != vecIdx) |
||||
{ |
||||
for (size_t j = 2; j < dims; j++) |
||||
{ |
||||
CV_Assert(inputs[vecIdx][j] == inputs[i][j]); |
||||
} |
||||
} |
||||
} |
||||
|
||||
if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound) |
||||
{ |
||||
for (size_t j = 2; j < dims; j++) |
||||
{ |
||||
outputs[0][j] = inputs[vecIdx][j]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
return false; |
||||
} |
||||
|
||||
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE |
||||
{ |
||||
std::vector<Mat> inputs; |
||||
inputs_arr.getMatVector(inputs); |
||||
|
||||
for (size_t i = 0; i < inputs.size(); i++) |
||||
{ |
||||
MatShape inpShape = shape(inputs[i].size); |
||||
if (isAllOnes(inpShape, 2, inputs[i].dims)) |
||||
{ |
||||
hasVecInput = true; |
||||
return; |
||||
} |
||||
} |
||||
} |
||||
|
||||
class EltwiseInvoker : public ParallelLoopBody |
||||
{ |
||||
EltwiseLayerInt8Impl& self; |
||||
std::vector<const Mat*> srcs; |
||||
std::vector<int> srcNumChannels; |
||||
int nsrcs; |
||||
Mat* dst; |
||||
Mat* buf; |
||||
std::vector<float> coeffs; |
||||
std::vector<int> zeropoints; |
||||
int nstripes; |
||||
const Mat* activLUT; |
||||
const ActivationLayerInt8* activ; |
||||
int channels; |
||||
size_t planeSize; |
||||
float offset; |
||||
|
||||
EltwiseInvoker(EltwiseLayerInt8Impl& self_) |
||||
: self(self_) |
||||
, nsrcs(0), dst(0), buf(0), nstripes(0), activ(0), channels(0) |
||||
, planeSize(0), offset(0) |
||||
{} |
||||
|
||||
public: |
||||
static void run(EltwiseLayerInt8Impl& self, |
||||
const Mat* srcs, int nsrcs, Mat& buf, Mat& dst, |
||||
int nstripes, float offset) |
||||
{ |
||||
const EltwiseOp op = self.op; |
||||
CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_8SC1, ""); CV_Assert(dst.isContinuous()); |
||||
CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs); |
||||
CV_CheckGE(nsrcs, 2, ""); |
||||
|
||||
CV_Assert(self.outputChannels == dst.size[1]); |
||||
|
||||
EltwiseInvoker p(self); |
||||
p.srcs.resize(nsrcs); |
||||
p.srcNumChannels.resize(nsrcs); |
||||
p.coeffs = self.coeffs; // can be sorted
|
||||
p.zeropoints = self.zeropoints; |
||||
|
||||
bool sortInputs = false; |
||||
for( int i = 0; i < nsrcs; i++ ) |
||||
{ |
||||
p.srcs[i] = &srcs[i]; |
||||
CV_CheckEQ(srcs[i].dims, dst.dims, ""); |
||||
CV_Assert(srcs[i].isContinuous()); |
||||
CV_Assert(srcs[i].type() == dst.type()); |
||||
p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1; |
||||
|
||||
if (self.channelsMode == ELTWISE_CHANNNELS_SAME) |
||||
{ |
||||
CV_Assert(srcs[i].size == dst.size); |
||||
} |
||||
else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0) |
||||
{ |
||||
if (i == 0) |
||||
CV_Assert(srcs[0].size == dst.size); |
||||
CV_Assert(self.outputChannels >= p.srcNumChannels[i]); |
||||
sortInputs = true; |
||||
} |
||||
else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE) |
||||
{ |
||||
if (i == 0) |
||||
CV_Assert(srcs[0].size == dst.size); |
||||
sortInputs = true; |
||||
} |
||||
else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX) |
||||
{ |
||||
CV_Assert(op == SUM); |
||||
CV_Assert(self.outputChannels >= p.srcNumChannels[i]); |
||||
sortInputs = true; |
||||
} |
||||
else |
||||
{ |
||||
CV_Assert(0 && "Internal error"); |
||||
} |
||||
|
||||
if (sortInputs) |
||||
{ |
||||
// Sort srcs and coefficients in the desc order by number of channels
|
||||
for (int j = i; j >= 1; j--) |
||||
{ |
||||
if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1])) |
||||
{ |
||||
std::swap(p.srcs[j - 1], p.srcs[j]); |
||||
std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]); |
||||
if (!p.coeffs.empty()) |
||||
std::swap(p.coeffs[j - 1], p.coeffs[j]); |
||||
if (!p.zeropoints.empty()) |
||||
std::swap(p.zeropoints[j - 1], p.zeropoints[j]); |
||||
} |
||||
else |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
|
||||
p.nsrcs = nsrcs; |
||||
p.dst = &dst; |
||||
p.buf = &buf; |
||||
p.nstripes = nstripes; |
||||
p.offset = offset; |
||||
p.channels = (dst.dims >= 4 ? dst.size[1] : 1); |
||||
|
||||
p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1); |
||||
CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, ""); |
||||
p.activLUT = &self.activationLUT; |
||||
p.activ = !self.activationLUT.empty() ? self.activ.get() : 0; |
||||
|
||||
parallel_for_(Range(0, nstripes), p, nstripes); |
||||
} |
||||
|
||||
void operator()(const Range& r) const CV_OVERRIDE |
||||
{ |
||||
const EltwiseOp op = self.op; |
||||
size_t total = dst->size[0]*planeSize; |
||||
size_t stripeSize = (total + nstripes - 1)/nstripes; |
||||
size_t stripeStart = r.start*stripeSize; |
||||
size_t stripeEnd = std::min(r.end*stripeSize, total); |
||||
const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0; |
||||
const int* zeropointsptr = !zeropoints.empty() ? &zeropoints[0] : 0; |
||||
const int8_t* lutptr = !activLUT->empty() ? activLUT->ptr<int8_t>() : 0; |
||||
int8_t* dstptr0 = dst->ptr<int8_t>(); |
||||
float* bufptr0 = buf->ptr<float>(); |
||||
int blockSize0 = 1 << 12; |
||||
|
||||
for (size_t ofs = stripeStart; ofs < stripeEnd; ) |
||||
{ |
||||
int sampleIdx = (int)(ofs / planeSize); |
||||
int delta = (int)ofs - sampleIdx * planeSize; |
||||
int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta)); |
||||
if( blockSize <= 0 ) |
||||
break; |
||||
ofs += blockSize; |
||||
|
||||
for (int c = 0; c < channels; c++) |
||||
{ |
||||
size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize; |
||||
int8_t* dstptr = dstptr0 + dstIdx; |
||||
float* bufptr = bufptr0 + dstIdx; |
||||
|
||||
// process first two inputs
|
||||
{ |
||||
const int8_t* srcptr0 = srcs[0]->ptr<int8_t>() + dstIdx; |
||||
|
||||
const int inputIdx = 1; |
||||
int src1_channels = srcNumChannels[inputIdx]; |
||||
if (c >= src1_channels) |
||||
{ |
||||
// no data from second input
|
||||
if (!coeffsptr) |
||||
{ |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
dstptr[j] = srcptr0[j]; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
float c0 = coeffsptr[0]; |
||||
int z0 = op == PROD ? zeropointsptr[0] : 0; |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
bufptr[j] = c0 * (srcptr0[j] - z0); |
||||
} |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize; |
||||
const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx; |
||||
|
||||
if (op == PROD) |
||||
{ |
||||
float c0 = coeffsptr[0]; |
||||
float c1 = coeffsptr[1]; |
||||
int z0 = zeropointsptr[0]; |
||||
int z1 = zeropointsptr[1]; |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
bufptr[j] = (c0*(srcptr0[j] - z0)) * (c1*(srcptrI[j] - z1)); |
||||
} |
||||
} |
||||
else if (op == MAX) |
||||
{ |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
dstptr[j] = std::max(srcptr0[j], srcptrI[j]); |
||||
} |
||||
} |
||||
else if (op == SUM) |
||||
{ |
||||
float c0 = coeffsptr[0]; |
||||
float c1 = coeffsptr[1]; |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
bufptr[j] = c0*srcptr0[j] + c1*srcptrI[j]; |
||||
} |
||||
} |
||||
else |
||||
CV_Error(Error::StsInternal, ""); |
||||
} |
||||
} |
||||
|
||||
// aggregate other inputs (3+)
|
||||
for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++) |
||||
{ |
||||
int srcI_channels = srcNumChannels[inputIdx]; |
||||
if (c >= srcI_channels) |
||||
continue; // no data from this input
||||
size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize; |
||||
const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx; |
||||
|
||||
if (op == PROD) |
||||
{ |
||||
float cI = coeffsptr[inputIdx]; |
||||
int zI = zeropointsptr[inputIdx]; |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
bufptr[j] *= cI*(srcptrI[j] - zI); |
||||
} |
||||
} |
||||
else if (op == MAX) |
||||
{ |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
dstptr[j] = std::max(dstptr[j], srcptrI[j]); |
||||
} |
||||
} |
||||
else if (op == SUM) |
||||
{ |
||||
float cI = coeffsptr[inputIdx]; |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
bufptr[j] += cI * srcptrI[j]; |
||||
} |
||||
} |
||||
else |
||||
CV_Error(Error::StsInternal, ""); |
||||
} |
||||
|
||||
// add offset and saturate cast to int8
|
||||
if (op == SUM || op == PROD) |
||||
{ |
||||
for (int j = 0; j < blockSize; j++) |
||||
{ |
||||
dstptr[j] = saturate_cast<int8_t>(std::round(bufptr[j] + offset)); |
||||
} |
||||
} |
||||
} |
||||
if( activ ) |
||||
{ |
||||
int8_t* ptr = dstptr0 + delta + sampleIdx*channels*planeSize; |
||||
activ->forwardSlice(ptr, lutptr, ptr, blockSize, planeSize, 0, channels); |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE |
||||
{ |
||||
CV_TRACE_FUNCTION(); |
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); |
||||
|
||||
std::vector<Mat> inputs, outputs; |
||||
inputs_arr.getMatVector(inputs); |
||||
outputs_arr.getMatVector(outputs); |
||||
|
||||
CV_Assert(outputs.size() == 1); |
||||
const int nstripes = getNumThreads(); |
||||
|
||||
if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2) |
||||
{ |
||||
for (size_t i = 0; i < inputs.size(); i++) |
||||
{ |
||||
MatShape inpShape = shape(inputs[i].size); |
||||
bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims); |
||||
|
||||
if (allOnes) |
||||
{ |
||||
Mat tmpInput = inputs[i]; |
||||
MatShape outShape = shape(outputs[0].size); |
||||
size_t xSize = outShape[2]; |
||||
for (size_t j = 3; j < outShape.size(); j++) |
||||
xSize *= outShape[j]; |
||||
|
||||
int dimVec[3] = {outShape[0], outShape[1], (int) xSize}; |
||||
std::vector<int> matSizesVec(&dimVec[0], &dimVec[0] + 3); |
||||
inputs[i] = Mat(matSizesVec, tmpInput.type()); |
||||
|
||||
std::vector<int> idx(outShape.size(), 0); |
||||
std::vector<int> outIdx(inpShape.size(), 0); |
||||
|
||||
for (size_t j = 0; j < outShape[0]; j++) |
||||
{ |
||||
outIdx[0] = idx[0] = j; |
||||
for(size_t k = 0; k < outShape[1]; k++) |
||||
{ |
||||
outIdx[1] = idx[1] = k; |
||||
for (size_t x = 0; x < xSize; x++) |
||||
{ |
||||
outIdx[2] = x; |
||||
inputs[i].at<int8_t>(outIdx.data()) = tmpInput.at<int8_t>(idx.data()); |
||||
} |
||||
} |
||||
} |
||||
inputs[i] = inputs[i].reshape(0, outShape); |
||||
} |
||||
} |
||||
} |
||||
|
||||
Mat buf = Mat(shape(outputs[0]), CV_32F); // to store intermediate results
|
||||
EltwiseInvoker::run(*this, &inputs[0], (int)inputs.size(), buf, outputs[0], nstripes, offset); |
||||
} |
||||
|
||||
virtual int64 getFLOPS(const std::vector<MatShape> &inputs, |
||||
const std::vector<MatShape> &outputs) const CV_OVERRIDE |
||||
{ |
||||
CV_UNUSED(outputs); // suppress unused variable warning
|
||||
CV_Assert(inputs.size()); |
||||
|
||||
// FIXIT: handle inputs with different number of channels
|
||||
long flops = inputs.size() * total(inputs[0]); |
||||
|
||||
return flops; |
||||
} |
||||
|
||||
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE |
||||
{ |
||||
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>(); |
||||
if (!activ_int8.empty()) |
||||
{ |
||||
activ = activ_int8; |
||||
if (!activ_int8->blobs.empty()) |
||||
activationLUT = activ_int8->blobs[0]; |
||||
return true; |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
Mat activationLUT; |
||||
Ptr<ActivationLayerInt8> activ; |
||||
|
||||
private: |
||||
bool hasVecInput; |
||||
float offset; |
||||
}; |
||||
|
||||
Ptr<EltwiseLayerInt8> EltwiseLayerInt8::create(const LayerParams& params) |
||||
{ |
||||
return Ptr<EltwiseLayerInt8>(new EltwiseLayerInt8Impl(params)); |
||||
} |
||||
|
||||
} |
||||
} |
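Editor's note (not part of the patch): in the quantized SUM path above, the float "coeff" values and the scalar "offset" carry all of the rescaling; typically each coefficient is the ratio of that input's scale to the output scale, and the offset absorbs the zero points (that mapping is an assumption here, not stated in this file). A scalar sketch of the two-input case mirroring the bufptr/offset arithmetic in EltwiseInvoker::operator(); the helper name is hypothetical.

#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t eltwiseSumInt8(int8_t a, int8_t b,
                                    float coeff0, float coeff1, float offset)
{
    float acc = coeff0 * a + coeff1 * b;                 // weighted sum accumulated in float
    int   out = (int)std::lround(acc + offset);          // add offset, round to nearest
    return (int8_t)std::min(std::max(out, -128), 127);   // saturate, like saturate_cast<int8_t>
}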
@@ -0,0 +1,266 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "../precomp.hpp" |
||||
#include "layers_common.hpp" |
||||
|
||||
#include <opencv2/dnn/shape_utils.hpp> |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace dnn |
||||
{ |
||||
|
||||
class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8 |
||||
{ |
||||
public: |
||||
enum { VEC_ALIGN = 32 }; |
||||
FullyConnectedLayerInt8Impl(const LayerParams& params) |
||||
{ |
||||
setParamsFrom(params); |
||||
output_zp = params.get<int>("zeropoints"); |
||||
axis = params.get<int>("axis", 1); |
||||
if (blobs.size() == 3) |
||||
{ |
||||
// blobs[0] - Weights
|
||||
// blobs[1] - Bias fused with offset
|
||||
// blobs[2] - Multipliers for output stage
|
||||
int numOutput = params.get<int>("num_output"); |
||||
int innerSize = (int)blobs[0].total() / numOutput; |
||||
|
||||
CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total()); |
||||
CV_Assert((size_t)numOutput == blobs[1].total()); |
||||
|
||||
weightsMat = blobs[0] = blobs[0].reshape(1, numOutput); |
||||
int vecsize = weightsMat.cols; |
||||
if (vecsize % VEC_ALIGN != 0) |
||||
{ |
||||
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN); |
||||
Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type()); |
||||
Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned); |
||||
wpadding.setTo(Scalar::all(0)); |
||||
weightsMat = weightsBuf.colRange(0, vecsize); |
||||
blobs[0].copyTo(weightsMat); |
||||
} |
||||
biasMat = blobs[1] = blobs[1].reshape(1, 1); |
||||
outputMultiplier = blobs[2]; |
||||
} |
||||
} |
||||
|
||||
bool getMemoryShapes(const std::vector<MatShape> &inputs, |
||||
const int requiredOutputs, |
||||
std::vector<MatShape> &outputs, |
||||
std::vector<MatShape> &) const CV_OVERRIDE |
||||
{ |
||||
int numOutput, cAxis; |
||||
CV_CheckEQ(inputs.size(), (size_t)1, ""); |
||||
CV_CheckEQ(blobs[0].dims, 2, ""); |
||||
numOutput = blobs[0].size[0]; |
||||
CV_Assert((size_t)numOutput == blobs[1].total()); |
||||
cAxis = normalize_axis(axis, inputs[0]); |
||||
|
||||
MatShape outShape(cAxis + 1); |
||||
for (int i = 0; i < cAxis; ++i) |
||||
outShape[i] = inputs[0][i]; |
||||
outShape.back() = numOutput; |
||||
|
||||
outputs.resize(1, outShape); |
||||
return false; |
||||
} |
||||
|
||||
virtual bool supportBackend(int backendId) CV_OVERRIDE |
||||
{ |
||||
return backendId == DNN_BACKEND_OPENCV; |
||||
} |
||||
|
||||
virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE |
||||
{ |
||||
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>(); |
||||
if (!activ_int8.empty()) |
||||
{ |
||||
activ = activ_int8; |
||||
if (!activ_int8->blobs.empty()) |
||||
activ_int8->blobs[0].convertTo(activationLUT, CV_32S); |
||||
return true; |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
class FullyConnected : public ParallelLoopBody |
||||
{ |
||||
public: |
||||
FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0), |
||||
dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {} |
||||
|
||||
static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier, |
||||
const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp) |
||||
{ |
||||
CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols && |
||||
dstMat.rows == srcMat.rows && dstMat.cols == weights.rows && |
||||
srcMat.type() == weights.type() && srcMat.type() == CV_8S && |
||||
dstMat.type() == CV_32S && biasMat.type() == CV_32S && |
||||
biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols ); |
||||
|
||||
FullyConnected p; |
||||
|
||||
p.srcMat = &srcMat; |
||||
p.weights = &weights; |
||||
p.biasMat = &biasMat; |
||||
p.outputMultiplier = &outputMultiplier; |
||||
p.activationLUT = &activationLUT; |
||||
p.dstMat = &dstMat; |
||||
p.nstripes = nstripes; |
||||
p.outZp = outZp; |
||||
p.activ = !activationLUT.empty() ? activ : 0; |
||||
p.useAVX2 = checkHardwareSupport(CPU_AVX2); |
||||
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX; |
||||
|
||||
parallel_for_(Range(0, nstripes), p, nstripes); |
||||
} |
||||
|
||||
void operator()(const Range& r) const CV_OVERRIDE |
||||
{ |
||||
int valign = FullyConnectedLayerInt8Impl::VEC_ALIGN; |
||||
int nsamples = srcMat->rows; |
||||
int nw0 = weights->rows; |
||||
int k, vecsize = srcMat->cols; |
||||
int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN); |
||||
size_t total = (size_t)nsamples*nw0; |
||||
size_t stripeSize = (total + nstripes - 1)/nstripes; |
||||
size_t stripeStart = r.start*stripeSize; |
||||
size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total); |
||||
size_t wstep = weights->step1(); |
||||
AutoBuffer<int8_t> srcbuf(vecsize_aligned + valign); |
||||
int8_t* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(int8_t))); |
||||
const int* lutptr = !activationLUT->empty() ? activationLUT->ptr<int>() : 0; |
||||
|
||||
for( k = vecsize; k < vecsize_aligned; k++ ) |
||||
sptr[k] = 0; |
||||
|
||||
for( size_t ofs = stripeStart; ofs < stripeEnd; ) |
||||
{ |
||||
int sampleIdx = (int)(ofs / nw0); |
||||
int delta = (int)(ofs - (size_t)sampleIdx*nw0); |
||||
const int8_t* sptr_ = srcMat->ptr<int8_t>(sampleIdx); |
||||
const int8_t* wptr = weights->ptr<int8_t>(delta); |
||||
int* dptr = dstMat->ptr<int>(sampleIdx) + delta; |
||||
const int* biasptr = biasMat->ptr<int>() + delta; |
||||
const float* multptr = outputMultiplier->ptr<float>() + delta; |
||||
int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs)); |
||||
|
||||
memcpy(sptr, sptr_, vecsize*sizeof(sptr[0])); |
||||
#if CV_TRY_AVX512_SKX |
||||
if( useAVX512 ) |
||||
opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); |
||||
else |
||||
#endif |
||||
#if CV_TRY_AVX2 |
||||
if( useAVX2 ) |
||||
opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); |
||||
else |
||||
#endif |
||||
{ |
||||
int i = 0; |
||||
#if CV_SIMD |
||||
for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) |
||||
{ |
||||
v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(), |
||||
vs2 = v_setzero_s32(), vs3 = v_setzero_s32(); |
||||
v_int32x4 outzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127); |
||||
v_int32x4 s = v_load(biasptr + i); |
||||
v_float32x4 mult = v_load(multptr + i); |
||||
|
||||
for( k = 0; k < vecsize; k += 16 ) |
||||
{ |
||||
v_int8x16 v = v_load_aligned(sptr + k); |
||||
vs0 = v_dotprod_expand_fast(v, v_load_aligned(wptr + k), vs0); |
||||
vs1 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep + k), vs1); |
||||
vs2 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*2 + k), vs2); |
||||
vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3); |
||||
} |
||||
|
||||
s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3)); |
||||
v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult); |
||||
v_store(dptr + i, v_min(v_max(out, outmin), outmax)); |
||||
} |
||||
#endif |
||||
|
||||
for( ; i < nw; i++, wptr += wstep ) |
||||
{ |
||||
int s0 = biasptr[i]; |
||||
float mult0 = multptr[i]; |
||||
|
||||
for( k = 0; k < vecsize; k++ ) |
||||
{ |
||||
int8_t v = sptr[k]; |
||||
s0 += (int)v*wptr[k]; |
||||
} |
||||
int out0 = outZp + (int)std::round(s0*mult0); |
||||
dptr[i] = std::min(std::max(out0, -128), 127); |
||||
} |
||||
} |
||||
|
||||
if(activ) |
||||
activ->forwardSlice(dptr, lutptr, dptr, 1, 1, delta, delta + nw); |
||||
|
||||
ofs += nw; |
||||
} |
||||
} |
||||
|
||||
const Mat *srcMat, *weights, *biasMat, *outputMultiplier, *activationLUT; |
||||
const ActivationLayerInt8* activ; |
||||
Mat* dstMat; |
||||
int nstripes, outZp; |
||||
bool useAVX2; |
||||
bool useAVX512; |
||||
}; |
||||
|
||||
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE |
||||
{ |
||||
CV_TRACE_FUNCTION(); |
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); |
||||
|
||||
std::vector<Mat> input, output; |
||||
inputs_arr.getMatVector(input); |
||||
outputs_arr.getMatVector(output); |
||||
|
||||
int axisCan = normalize_axis(axis, input[0].dims); |
||||
int outerSize = input[0].total(0, axisCan); |
||||
Mat srcMat = input[0].reshape(1, outerSize); |
||||
|
||||
Mat dstMat = output[0].reshape(1, outerSize); |
||||
Mat dstMatInt32= Mat(shape(dstMat), CV_32S); |
||||
|
||||
const int nstripes = getNumThreads(); |
||||
FullyConnected::run(srcMat, weightsMat, biasMat, outputMultiplier, activationLUT, dstMatInt32, activ.get(), nstripes, output_zp); |
||||
dstMatInt32.convertTo(dstMat, CV_8S); |
||||
} |
||||
|
||||
virtual int64 getFLOPS(const std::vector<MatShape> &inputs, |
||||
const std::vector<MatShape> &outputs) const CV_OVERRIDE |
||||
{ |
||||
CV_UNUSED(inputs); // suppress unused variable warning
|
||||
long flops = 0; |
||||
|
||||
int innerSize = blobs[0].size[1]; |
||||
for(int i = 0; i < outputs.size(); i++) |
||||
{ |
||||
flops += CV_BIG_INT(3)*innerSize*total(outputs[i]); |
||||
} |
||||
|
||||
return flops; |
||||
|
||||
} |
||||
|
||||
Mat weightsMat, biasMat, outputMultiplier, activationLUT; |
||||
Ptr<ActivationLayerInt8> activ; |
||||
}; |
||||
|
||||
Ptr<InnerProductLayerInt8> InnerProductLayerInt8::create(const LayerParams& params) |
||||
{ |
||||
return Ptr<InnerProductLayerInt8>(new FullyConnectedLayerInt8Impl(params)); |
||||
} |
||||
|
||||
} |
||||
} |
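Editor's note (not part of the patch): the inner-product kernel above accumulates int8*int8 products into int32, then applies a per-output float multiplier and the output zero point before saturating back to int8. A reference scalar version of one output element, matching the tail loop in FullyConnected::operator(); the helper name is hypothetical.

#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t fcOutputInt8(const int8_t* x, const int8_t* w, int n,
                                  int bias, float multiplier, int outZp)
{
    int acc = bias;                                    // bias is already fused with input offsets
    for (int k = 0; k < n; k++)
        acc += (int)x[k] * (int)w[k];                  // widen to int32 before accumulating
    int out = outZp + (int)std::lround(acc * multiplier);
    return (int8_t)std::min(std::max(out, -128), 127); // clamp to [-128, 127]
}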
@@ -0,0 +1,41 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/shape_utils.hpp>

#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// dispatched AVX/AVX2 optimizations
#include "./layers_common.simd.hpp"
#include "int8layers/layers_common.simd_declarations.hpp"
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/ocl4dnn.hpp"
#endif

namespace cv
{
namespace dnn
{
void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
                                std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations,
                                cv::String &padMode, std::vector<size_t>& adjust_pads);

void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<bool>& globalPooling,
                            std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);

void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
                          const std::vector<size_t>& stride, const String &padMode,
                          const std::vector<size_t>& dilation, std::vector<int>& out);

void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
                         const std::vector<size_t>& strides, const String &padMode,
                         std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
}
}

#endif
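Editor's note (not part of the patch): the CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY block above only pulls in declarations of the SIMD kernels; each enabled ISA gets its own opt_<ISA>:: namespace via layers_common.simd_declarations.hpp, and callers pick an implementation at run time. The pattern below is the one the int8 fully-connected layer in this same patch uses (shown here only as an illustration of the dispatch idea):

// #if CV_TRY_AVX2
//     if (checkHardwareSupport(CPU_AVX2))
//         opt_AVX2::fastGEMM1T(sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp);
//     else
// #endif
//     { /* portable universal-intrinsics / scalar fallback */ }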
@@ -0,0 +1,637 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "opencv2/core/hal/intrin.hpp" |
||||
|
||||
namespace cv { |
||||
namespace dnn { |
||||
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN |
||||
|
||||
void fastConv( const int8_t* weights, size_t wstep, const int* bias, |
||||
const int8_t* rowbuf, int* output, const int* outShape, |
||||
int blockSize, int vecsize, int vecsize_aligned, int outZp, |
||||
const float* multiplier, bool initOutput, bool finalOutput ); |
||||
void fastDepthwiseConv( const int8_t* wptr, |
||||
int kernel_h, int kernel_w, |
||||
int stride_h, int stride_w, |
||||
int dilation_h, int dilation_w, |
||||
int pad_t, int pad_l, |
||||
const int* biasptr, const float* multptr, |
||||
const int8_t* inptr_, |
||||
int height, int width, |
||||
int* outptr_, |
||||
int out_d, int outH, int outW, |
||||
int inpZp, int outZp ); |
||||
void fastGEMM1T( const int8_t* vec, const int8_t* weights, |
||||
size_t wstep, const int* bias, const float* multiplier, |
||||
int* dst, int nvecs, int vecsize, int outZp ); |
||||
|
||||
#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX2 |
||||
#define OPENCV_FMADD_EPI8(_Tpvec, func) \ |
||||
inline _Tpvec _##func##_fmaddepi8_epi32(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
|
||||
{ \
|
||||
_Tpvec even_a = _##func##_srai_epi16(_##func##_bslli_epi128(a, 1), 8); \
|
||||
_Tpvec odd_a = _##func##_srai_epi16(a, 8); \
|
||||
\
|
||||
_Tpvec even_b = _##func##_srai_epi16(_##func##_bslli_epi128(b, 1), 8); \
|
||||
_Tpvec odd_b = _##func##_srai_epi16(b, 8); \
|
||||
\
|
||||
_Tpvec prod0 = _##func##_madd_epi16(even_a, even_b); \
|
||||
_Tpvec prod1 = _##func##_madd_epi16(odd_a, odd_b); \
|
||||
return _##func##_add_epi32(_##func##_add_epi32(prod0, prod1), c); \
|
||||
} |
||||
OPENCV_FMADD_EPI8(__m256i, mm256) |
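Editor's note (not part of the patch): the macro above splits the int8 vectors into even/odd 16-bit lanes and uses madd_epi16, so each 32-bit lane of the result accumulates the dot product of four consecutive signed 8-bit pairs. A scalar model of what _mm256_fmaddepi8_epi32(a, b, c) computes, for reference only; the function name is hypothetical.

#include <cstdint>

static inline void fmaddepi8_epi32_ref(const int8_t a[32], const int8_t b[32],
                                       int32_t c[8])
{
    for (int lane = 0; lane < 8; lane++)      // eight 32-bit lanes in a __m256i
        for (int k = 0; k < 4; k++)           // four int8 pairs feed each lane
            c[lane] += (int32_t)a[lane*4 + k] * (int32_t)b[lane*4 + k];
}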
||||
//OPENCV_FMADD_EPI8(__m512i, mm512)
|
||||
|
||||
enum { FASCONV_BASE_VECSZ = 4 }; |
||||
|
||||
void fastConv( const int8_t* weights, size_t wstep, const int* bias, |
||||
const int8_t* rowbuf, int* output, const int* outShape, |
||||
int blockSize, int vecsize, int vecsize_aligned, int outZp, |
||||
const float* multiplier, bool initOutput, bool finalOutput ) |
||||
{ |
||||
int outCn = outShape[1]; |
||||
size_t outPlaneSize = outShape[2]*outShape[3]; |
||||
int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; |
||||
int rsz = blockSize % FASCONV_BASE_VECSZ; |
||||
for( int i = 0; i < rsz; i++ ) |
||||
maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; |
||||
__m128 mask = _mm_loadu_ps((const float*)maskbuf); |
||||
|
||||
// now compute dot product of the weights
|
||||
// and im2row-transformed part of the tensor
|
||||
for( int i = 0; i < outCn; i += 3 ) |
||||
{ |
||||
const int8_t* wptr0 = weights + i*wstep; |
||||
const int8_t* wptr1 = wptr0 + wstep; |
||||
const int8_t* wptr2 = wptr1 + wstep; |
||||
int* outptr0 = output + i*outPlaneSize; |
||||
int* outptr1 = outptr0 + outPlaneSize; |
||||
int* outptr2 = outptr1 + outPlaneSize; |
||||
int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; |
||||
float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2]; |
||||
|
||||
if( i+2 >= outCn ) |
||||
{ |
||||
wptr2 = wptr1; |
||||
outptr2 = outptr1; |
||||
bias2 = bias1; |
||||
mult2 = mult1; |
||||
|
||||
if( i+1 >= outCn ) |
||||
{ |
||||
wptr2 = wptr1 = wptr0; |
||||
outptr2 = outptr1 = outptr0; |
||||
bias2 = bias1 = bias0; |
||||
mult2 = mult1 = mult0; |
||||
} |
||||
} |
||||
int j = 0; |
||||
for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) |
||||
{ |
||||
bool tail = false; |
||||
if (j + FASCONV_BASE_VECSZ > blockSize) |
||||
{ |
||||
if (j == 0) |
||||
break; |
||||
j = blockSize - FASCONV_BASE_VECSZ; |
||||
tail = true; |
||||
} |
||||
int k = 0; |
||||
const int8_t* rptr = rowbuf + j*vecsize_aligned; |
||||
|
||||
__m256i vs00 = _mm256_setzero_si256(), vs01 = _mm256_setzero_si256(), |
||||
vs02 = _mm256_setzero_si256(), vs03 = _mm256_setzero_si256(), |
||||
vs10 = _mm256_setzero_si256(), vs11 = _mm256_setzero_si256(), |
||||
vs12 = _mm256_setzero_si256(), vs13 = _mm256_setzero_si256(), |
||||
vs20 = _mm256_setzero_si256(), vs21 = _mm256_setzero_si256(), |
||||
vs22 = _mm256_setzero_si256(), vs23 = _mm256_setzero_si256(); |
||||
|
||||
/* TODO : Fix AVX-512 path. Segmentation fault in Conv2D Tests.
|
||||
#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling
|
||||
if (vecsize >= 64) |
||||
{ |
||||
__m512i vs00_5 = _mm512_setzero_si512(), vs01_5 = _mm512_setzero_si512(), |
||||
vs02_5 = _mm512_setzero_si512(), vs03_5 = _mm512_setzero_si512(), |
||||
vs10_5 = _mm512_setzero_si512(), vs11_5 = _mm512_setzero_si512(), |
||||
vs12_5 = _mm512_setzero_si512(), vs13_5 = _mm512_setzero_si512(), |
||||
vs20_5 = _mm512_setzero_si512(), vs21_5 = _mm512_setzero_si512(), |
||||
vs22_5 = _mm512_setzero_si512(), vs23_5 = _mm512_setzero_si512(); |
||||
|
||||
for (; k <= vecsize - 64; k += 64, rptr += 64) |
||||
{ |
||||
__m512i w0 = _mm512_load_si512(wptr0 + k); |
||||
__m512i w1 = _mm512_load_si512(wptr1 + k); |
||||
__m512i w2 = _mm512_load_si512(wptr2 + k); |
||||
__m512i r0 = _mm512_load_si512(rptr); |
||||
|
||||
vs00_5 = _mm512_fmaddepi8_epi32(w0, r0, vs00_5); |
||||
vs10_5 = _mm512_fmaddepi8_epi32(w1, r0, vs10_5); |
||||
vs20_5 = _mm512_fmaddepi8_epi32(w2, r0, vs20_5); |
||||
|
||||
r0 = _mm512_load_si512(rptr + vecsize_aligned); |
||||
vs01_5 = _mm512_fmaddepi8_epi32(w0, r0, vs01_5); |
||||
vs11_5 = _mm512_fmaddepi8_epi32(w1, r0, vs11_5); |
||||
vs21_5 = _mm512_fmaddepi8_epi32(w2, r0, vs21_5); |
||||
|
||||
r0 = _mm512_load_si512(rptr + vecsize_aligned*2); |
||||
vs02_5 = _mm512_fmaddepi8_epi32(w0, r0, vs02_5); |
||||
vs12_5 = _mm512_fmaddepi8_epi32(w1, r0, vs12_5); |
||||
vs22_5 = _mm512_fmaddepi8_epi32(w2, r0, vs22_5); |
||||
|
||||
r0 = _mm512_load_si512(rptr + vecsize_aligned*3); |
||||
vs03_5 = _mm512_fmaddepi8_epi32(w0, r0, vs03_5); |
||||
vs13_5 = _mm512_fmaddepi8_epi32(w1, r0, vs13_5); |
||||
vs23_5 = _mm512_fmaddepi8_epi32(w2, r0, vs23_5); |
||||
} |
||||
|
||||
// now fold the 512 bit accumulator vectors into 256 bit vectors so that the AVX2 code can finish
|
||||
// the tail of the vector
|
||||
|
||||
vs00 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs00_5, 0), _mm512_extracti32x8_epi32(vs00_5, 1)); |
||||
vs10 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs10_5, 0), _mm512_extracti32x8_epi32(vs10_5, 1)); |
||||
vs20 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs20_5, 0), _mm512_extracti32x8_epi32(vs20_5, 1)); |
||||
|
||||
vs01 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs01_5, 0), _mm512_extracti32x8_epi32(vs01_5, 1)); |
||||
vs11 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs11_5, 0), _mm512_extracti32x8_epi32(vs11_5, 1)); |
||||
vs21 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs21_5, 0), _mm512_extracti32x8_epi32(vs21_5, 1)); |
||||
|
||||
vs02 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs02_5, 0), _mm512_extracti32x8_epi32(vs02_5, 1)); |
||||
vs12 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs12_5, 0), _mm512_extracti32x8_epi32(vs12_5, 1)); |
||||
vs22 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs22_5, 0), _mm512_extracti32x8_epi32(vs22_5, 1)); |
||||
|
||||
vs03 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs03_5, 0), _mm512_extracti32x8_epi32(vs03_5, 1)); |
||||
vs13 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs13_5, 0), _mm512_extracti32x8_epi32(vs13_5, 1)); |
||||
vs23 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs23_5, 0), _mm512_extracti32x8_epi32(vs23_5, 1)); |
||||
} |
||||
#endif |
||||
*/ |
||||
for (; k < vecsize; k += 32, rptr += 32 ) |
||||
{ |
||||
__m256i w0 = _mm256_load_si256((const __m256i*)(wptr0 + k)); |
||||
__m256i w1 = _mm256_load_si256((const __m256i*)(wptr1 + k)); |
||||
__m256i w2 = _mm256_load_si256((const __m256i*)(wptr2 + k)); |
||||
__m256i r0 = _mm256_load_si256((const __m256i*)rptr); |
||||
|
||||
vs00 = _mm256_fmaddepi8_epi32(w0, r0, vs00); |
||||
vs10 = _mm256_fmaddepi8_epi32(w1, r0, vs10); |
||||
vs20 = _mm256_fmaddepi8_epi32(w2, r0, vs20); |
||||
|
||||
r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned)); |
||||
vs01 = _mm256_fmaddepi8_epi32(w0, r0, vs01); |
||||
vs11 = _mm256_fmaddepi8_epi32(w1, r0, vs11); |
||||
vs21 = _mm256_fmaddepi8_epi32(w2, r0, vs21); |
||||
|
||||
r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*2)); |
||||
vs02 = _mm256_fmaddepi8_epi32(w0, r0, vs02); |
||||
vs12 = _mm256_fmaddepi8_epi32(w1, r0, vs12); |
||||
vs22 = _mm256_fmaddepi8_epi32(w2, r0, vs22); |
||||
|
||||
r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*3)); |
||||
vs03 = _mm256_fmaddepi8_epi32(w0, r0, vs03); |
||||
vs13 = _mm256_fmaddepi8_epi32(w1, r0, vs13); |
||||
vs23 = _mm256_fmaddepi8_epi32(w2, r0, vs23); |
||||
} |
||||
|
||||
__m256i t0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs00, vs01), _mm256_hadd_epi32(vs02, vs03)); |
||||
__m256i t1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs10, vs11), _mm256_hadd_epi32(vs12, vs13)); |
||||
__m256i t2 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs20, vs21), _mm256_hadd_epi32(vs22, vs23)); |
||||
|
||||
t0 = _mm256_add_epi32(t0, _mm256_permute2x128_si256(t0, t0, 1)); |
||||
t1 = _mm256_add_epi32(t1, _mm256_permute2x128_si256(t1, t1, 1)); |
||||
t2 = _mm256_add_epi32(t2, _mm256_permute2x128_si256(t2, t2, 1)); |
||||
|
||||
__m128i s0, s1, s2; |
||||
|
||||
if( initOutput ) |
||||
{ |
||||
s0 = _mm_set1_epi32(bias0); |
||||
s1 = _mm_set1_epi32(bias1); |
||||
s2 = _mm_set1_epi32(bias2); |
||||
} |
||||
else |
||||
{ |
||||
s0 = _mm_loadu_si128((__m128i*)(outptr0 + j)); |
||||
s1 = _mm_loadu_si128((__m128i*)(outptr1 + j)); |
||||
s2 = _mm_loadu_si128((__m128i*)(outptr2 + j)); |
||||
} |
||||
|
||||
s0 = _mm_add_epi32(s0, _mm256_castsi256_si128(t0)); |
||||
s1 = _mm_add_epi32(s1, _mm256_castsi256_si128(t1)); |
||||
s2 = _mm_add_epi32(s2, _mm256_castsi256_si128(t2)); |
||||
|
||||
if( finalOutput ) |
||||
{ |
||||
__m128i voutzp = _mm_set1_epi32(outZp); |
||||
__m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127); |
||||
s0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s0), _mm_set1_ps(mult0)))); |
||||
s1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s1), _mm_set1_ps(mult1)))); |
||||
s2 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s2), _mm_set1_ps(mult2)))); |
||||
|
||||
s0 = _mm_min_epi32(_mm_max_epi32(s0, outmin), outmax); |
||||
s1 = _mm_min_epi32(_mm_max_epi32(s1, outmin), outmax); |
||||
s2 = _mm_min_epi32(_mm_max_epi32(s2, outmin), outmax); |
||||
} |
||||
if( tail ) |
||||
{ |
||||
s0 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr0 + j), _mm_castsi128_ps(s0), mask)); |
||||
s1 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr1 + j), _mm_castsi128_ps(s1), mask)); |
||||
s2 = _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr2 + j), _mm_castsi128_ps(s2), mask)); |
||||
} |
||||
_mm_storeu_si128((__m128i*)(outptr0 + j), s0); |
||||
_mm_storeu_si128((__m128i*)(outptr1 + j), s1); |
||||
_mm_storeu_si128((__m128i*)(outptr2 + j), s2); |
||||
} |
||||
|
||||
for( ; j <= blockSize - 2; j += 2 ) |
||||
{ |
||||
const int8_t* rptr0 = rowbuf + j*vecsize_aligned; |
||||
const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned; |
||||
int s00, s01, s10, s11, s20, s21; |
||||
|
||||
if( initOutput ) |
||||
{ |
||||
s00 = s01 = bias0; |
||||
s10 = s11 = bias1; |
||||
s20 = s21 = bias2; |
||||
} |
||||
else |
||||
{ |
||||
s00 = outptr0[j]; s01 = outptr0[j+1]; |
||||
s10 = outptr1[j]; s11 = outptr1[j+1]; |
||||
s20 = outptr2[j]; s21 = outptr2[j+1]; |
||||
} |
||||
|
||||
for( int k = 0; k < vecsize; k++ ) |
||||
{ |
||||
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; |
||||
int8_t r = rptr0[k]; |
||||
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; |
||||
r = rptr1[k]; |
||||
s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r; |
||||
} |
||||
|
||||
if( finalOutput ) |
||||
{ |
||||
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); |
||||
s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127); |
||||
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); |
||||
s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127); |
||||
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); |
||||
s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127); |
||||
} |
||||
outptr0[j] = s00; |
||||
outptr0[j+1] = s01; |
||||
outptr1[j] = s10; |
||||
outptr1[j+1] = s11; |
||||
outptr2[j] = s20; |
||||
outptr2[j+1] = s21; |
||||
} |
||||
|
||||
for( ; j < blockSize; j++ ) |
||||
{ |
||||
const int8_t* rptr0 = rowbuf + j*vecsize_aligned; |
||||
int s00, s10, s20; |
||||
|
||||
if( initOutput ) |
||||
{ |
||||
s00 = bias0; |
||||
s10 = bias1; |
||||
s20 = bias2; |
||||
} |
||||
else |
||||
{ |
||||
s00 = outptr0[j]; |
||||
s10 = outptr1[j]; |
||||
s20 = outptr2[j]; |
||||
} |
||||
|
||||
for( int k = 0; k < vecsize; k++ ) |
||||
{ |
||||
int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; |
||||
int8_t r = rptr0[k]; |
||||
s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r; |
||||
} |
||||
|
||||
if( finalOutput ) |
||||
{ |
||||
s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127); |
||||
s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127); |
||||
s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127); |
||||
} |
||||
outptr0[j] = s00; |
||||
outptr1[j] = s10; |
||||
outptr2[j] = s20; |
||||
} |
||||
} |
||||
_mm256_zeroupper(); |
||||
} |
||||
|
||||
static inline void _mm256_expand_mul_add(const __m256i& a, const __m256i& b, |
||||
__m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3) |
||||
{ |
||||
__m256i a0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a)); |
||||
__m256i a1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a, 1)); |
||||
|
||||
__m256i b0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(b)); |
||||
__m256i b1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b, 1)); |
||||
|
||||
__m256i a0b0 = _mm256_mullo_epi16(a0, b0); |
||||
__m256i a1b1 = _mm256_mullo_epi16(a1, b1); |
||||
|
||||
out0 = _mm256_add_epi32(out0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a0b0))); |
||||
out1 = _mm256_add_epi32(out1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a0b0, 1))); |
||||
out2 = _mm256_add_epi32(out2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a1b1))); |
||||
out3 = _mm256_add_epi32(out3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a1b1, 1))); |
||||
} |
||||
|
||||
static inline void _mm256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b)
{
    __m256i t0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i t1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));

    const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    __m256i p0 = _mm256_shuffle_epi8(t0, sh);
    __m256i p1 = _mm256_shuffle_epi8(t1, sh);
    __m256i lo = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i hi = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
    a = _mm256_unpacklo_epi64(lo, hi);
    b = _mm256_unpackhi_epi64(lo, hi);
}
||||
|
||||
void fastDepthwiseConv( const int8_t* wptr, |
||||
int kernel_h, int kernel_w, |
||||
int stride_h, int stride_w, |
||||
int dilation_h, int dilation_w, |
||||
int pad_t, int pad_l, |
||||
const int* biasptr, const float* multptr, |
||||
const int8_t* inptr_, |
||||
int height, int width, |
||||
int* outptr_, |
||||
int out_d, int outH, int outW, |
||||
int inpZp, int outZp) |
||||
{ |
||||
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], |
||||
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], |
||||
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; |
||||
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); |
||||
float mult = multptr[out_d]; |
||||
int bias = biasptr[out_d]; |
||||
int biasCopy; |
||||
|
||||
for (int out_i = 0; out_i < outH; out_i++) |
||||
{ |
||||
int in_i = out_i * stride_h - pad_t, out_j = 0; |
||||
const int8_t* imgptr0 = inptr_ + in_i*width; |
||||
const int8_t* imgptr1 = imgptr0 + dilation_h*width; |
||||
const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; |
||||
int8_t w00 = w00_, w01 = w01_, w02 = w02_; |
||||
int8_t w20 = w20_, w21 = w21_, w22 = w22_; |
||||
int out; |
||||
biasCopy = bias; |
||||
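        // If the top or bottom kernel row falls outside the image, its taps are dropped and the
        // input zero point (the value padding takes in the quantized domain) is folded into the
        // bias instead of reading out-of-range pixels.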
        if (in_i < 0)
        {
            biasCopy += inpZp * (w00 + w01 + w02);
            w00 = w01 = w02 = 0;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h-1) >= height)
        {
            biasCopy += inpZp * (w20 + w21 + w22);
            w20 = w21 = w22 = 0;
            imgptr2 = imgptr1;
        }
||||
int* outptr = outptr_ + out_i*outW; |
||||
if (pad_l > 0) |
||||
{ |
||||
out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + |
||||
(int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + |
||||
(int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + |
||||
biasCopy + inpZp*(w00 + w10 + w20); |
||||
outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); |
||||
out_j = 1; |
||||
} |
||||
|
||||
if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) |
||||
{ |
||||
const int VECSZ = 32; |
||||
__m256i vw00 = _mm256_set1_epi8(w00), vw01 = _mm256_set1_epi8(w01), vw02 = _mm256_set1_epi8(w02), |
||||
vw10 = _mm256_set1_epi8(w10), vw11 = _mm256_set1_epi8(w11), vw12 = _mm256_set1_epi8(w12), |
||||
vw20 = _mm256_set1_epi8(w20), vw21 = _mm256_set1_epi8(w21), vw22 = _mm256_set1_epi8(w22); |
||||
__m256i vbias = _mm256_set1_epi32(biasCopy), voutzp = _mm256_set1_epi32(outZp), |
||||
outmin = _mm256_set1_epi32(-128), outmax = _mm256_set1_epi32(127); |
||||
__m256 vmult = _mm256_set1_ps(mult); |
||||
__m256i vout0, vout1, vout2, vout3; |
||||
|
||||
if( stride_w == 1 ) |
||||
{ |
||||
for( ; out_j < outW1; out_j += VECSZ ) |
||||
{ |
||||
if (out_j + VECSZ > outW1) |
||||
{ |
||||
if (out_j <= pad_l) |
||||
break; |
||||
out_j = outW1 - VECSZ; |
||||
} |
||||
int in_j = out_j * stride_w - pad_l; |
||||
__m256i v00 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j)), |
||||
v01 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w)), |
||||
v02 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w*2)), |
||||
v10 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j)), |
||||
v11 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w)), |
||||
v12 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w*2)), |
||||
v20 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j)), |
||||
v21 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w)), |
||||
v22 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w*2)); |
||||
|
||||
vout0 = vout1 = vout2 = vout3 = vbias; |
||||
_mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); |
||||
|
||||
vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult))); |
||||
vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult))); |
||||
vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult))); |
||||
vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult))); |
||||
|
||||
vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax); |
||||
vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax); |
||||
vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax); |
||||
vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax); |
||||
|
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j), vout0); |
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1); |
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2); |
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
for( ; out_j < outW1; out_j += VECSZ ) |
||||
{ |
||||
if (out_j + VECSZ > outW1) |
||||
{ |
||||
if (out_j <= pad_l) |
||||
break; |
||||
out_j = outW1 - VECSZ; |
||||
} |
||||
int in_j = out_j * stride_w - pad_l; |
||||
__m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; |
||||
_mm256_load_deinterleave(imgptr0 + in_j, v00, v01); |
||||
_mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); |
||||
_mm256_load_deinterleave(imgptr1 + in_j, v10, v11); |
||||
_mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); |
||||
_mm256_load_deinterleave(imgptr2 + in_j, v20, v21); |
||||
_mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); |
||||
|
||||
vout0 = vout1 = vout2 = vout3 = vbias; |
||||
_mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3); |
||||
_mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3); |
||||
|
||||
vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult))); |
||||
vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult))); |
||||
vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult))); |
||||
vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult))); |
||||
|
||||
vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax); |
||||
vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax); |
||||
vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax); |
||||
vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax); |
||||
|
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j), vout0); |
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1); |
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2); |
||||
_mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3); |
||||
} |
||||
} |
||||
} |
||||
|
||||
for (; out_j < outW1; out_j++) |
||||
{ |
||||
int in_j = out_j * stride_w - pad_l; |
||||
out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + |
||||
(int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + |
||||
(int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; |
||||
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); |
||||
} |
||||
|
||||
for (; out_j < outW; out_j++ ) |
||||
{ |
||||
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; |
||||
int s0 = 1, s1 = 1, s2 = 1; |
||||
if (in_j0 >= width) |
||||
{ |
||||
in_j0 = 0; |
||||
s0 = 0; |
||||
biasCopy += inpZp*(w00 + w10 + w20); |
||||
} |
||||
if (in_j1 >= width) |
||||
{ |
||||
in_j1 = 0; |
||||
s1 = 0; |
||||
biasCopy += inpZp*(w01 + w11 + w21); |
||||
} |
||||
if (in_j2 >= width) |
||||
{ |
||||
in_j2 = 0; |
||||
s2 = 0; |
||||
biasCopy += inpZp*(w02 + w12 + w22); |
||||
} |
||||
out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + |
||||
(int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + |
||||
(int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; |
||||
outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); |
||||
} |
||||
} |
||||
_mm256_zeroupper(); |
||||
} |
||||

// dst = vec * weights^t + bias
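// Requantization note: each output is an int32 dot product of int8 inputs and weights plus an
// int32 bias, rescaled by a per-output float multiplier, shifted by the output zero point and
// (in the vectorized path) clamped to [-128, 127]:
//   dst[i] = outZp + round((sum_k vec[k]*weights[i*wstep + k] + bias[i]) * multiplier[i])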
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
                 size_t wstep, const int* bias, const float* multiplier,
                 int* dst, int nvecs, int vecsize, int outZp )
{
    int i = 0;

    for( ; i <= nvecs - 8; i += 8 )
    {
        const int8_t* wptr = weights + i*wstep;
        __m256i vs0 = _mm256_setzero_si256(), vs1 = _mm256_setzero_si256(),
                vs2 = _mm256_setzero_si256(), vs3 = _mm256_setzero_si256(),
                vs4 = _mm256_setzero_si256(), vs5 = _mm256_setzero_si256(),
                vs6 = _mm256_setzero_si256(), vs7 = _mm256_setzero_si256();

        __m128i voutzp = _mm_set1_epi32(outZp);
        __m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127);

        for( int k = 0; k < vecsize; k += 32, wptr += 32 )
        {
            __m256i v = _mm256_load_si256((const __m256i*)(vec + k));

            vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0);
            vs1 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep)), v, vs1);
            vs2 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*2)), v, vs2);
            vs3 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*3)), v, vs3);
            vs4 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*4)), v, vs4);
            vs5 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*5)), v, vs5);
            vs6 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*6)), v, vs6);
            vs7 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*7)), v, vs7);
        }

        __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs1), _mm256_hadd_epi32(vs2, vs3));
        __m256i s1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs4, vs5), _mm256_hadd_epi32(vs6, vs7));

        s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1));
        s1 = _mm256_add_epi32(s1, _mm256_permute2x128_si256(s1, s1, 1));

        __m128i t0 = _mm_add_epi32(_mm256_castsi256_si128(s0), _mm_loadu_si128((__m128i*)(bias + i)));
        __m128i t1 = _mm_add_epi32(_mm256_castsi256_si128(s1), _mm_loadu_si128((__m128i*)(bias + i + 4)));

        t0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t0), _mm_loadu_ps(multiplier + i))));
        t1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t1), _mm_loadu_ps(multiplier + i + 4))));

        t0 = _mm_min_epi32(_mm_max_epi32(t0, outmin), outmax);
        t1 = _mm_min_epi32(_mm_max_epi32(t1, outmin), outmax);

        _mm_storeu_si128((__m128i*)(dst + i), t0);
        _mm_storeu_si128((__m128i*)(dst + i + 4), t1);
    }

    for( ; i < nvecs; i++ )
    {
        const int8_t* wptr = weights + i*wstep;
        __m256i vs0 = _mm256_setzero_si256();

        for( int k = 0; k < vecsize; k += 32, wptr += 32 )
        {
            __m256i v = _mm256_load_si256((const __m256i*)(vec + k));
            vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0);
        }

        __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs0), vs0);
        s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1));
        int temp = _mm_extract_epi32(_mm256_castsi256_si128(s0), 0);
        dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]);
    }

    _mm256_zeroupper();
}
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END
}} // namespace

@ -0,0 +1,595 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "../precomp.hpp" |
||||
#include "layers_common.hpp" |
||||
#include "opencv2/core/hal/intrin.hpp" |
||||
|
||||
#include <float.h> |
||||
#include <algorithm> |
||||
#include <numeric> |
||||
using std::max; |
||||
using std::min; |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace dnn |
||||
{ |
||||
|
||||
class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8 |
||||
{ |
||||
public: |
||||
PoolingLayerInt8Impl(const LayerParams& params) |
||||
{ |
||||
computeMaxIdx = false; |
||||
globalPooling = false; |
||||
isGlobalPooling = std::vector<bool>(3, false); |
||||
output_zp = params.get<int>("zeropoints"); |
||||
input_zp = params.get<int>("input_zeropoint", 0); |
||||
multiplier = params.get<float>("multiplier", 1.f); |
||||
|
||||
hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false); |
||||
shapesInitialized = !hasDynamicShapes; |
||||
|
||||
if (params.has("pool") || params.has("kernel_size") || |
||||
params.has("kernel_w") || params.has("kernel_h")) |
||||
{ |
||||
String pool = toLowerCase(params.get<String>("pool", "max")); |
||||
if (pool == "max") |
||||
type = MAX; |
||||
else if (pool == "ave") |
||||
type = AVE; |
||||
else if (pool == "sum") |
||||
type = SUM; |
||||
else |
||||
CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\""); |
||||
|
||||
getPoolingKernelParams(params, kernel_size, isGlobalPooling, pads_begin, pads_end, strides, padMode); |
||||
globalPooling = isGlobalPooling[0] || isGlobalPooling[1] || isGlobalPooling[2]; |
||||
} |
||||
else |
||||
CV_Error(Error::StsBadArg, "Cannot determine pooling type"); |
||||
setParamsFrom(params); |
||||
ceilMode = params.get<bool>("ceil_mode", true); |
||||
spatialScale = params.get<float>("spatial_scale", 1); |
||||
avePoolPaddedArea = params.get<bool>("ave_pool_padded_area", true); |
||||
} |
||||
|
||||
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE |
||||
{ |
||||
std::vector<Mat> inputs, outputs; |
||||
inputs_arr.getMatVector(inputs); |
||||
outputs_arr.getMatVector(outputs); |
||||
|
||||
CV_Assert(!inputs.empty()); |
||||
CV_Assert(outputs.size() == 1); |
||||
|
||||
std::vector<int> inp; |
||||
std::vector<int> out; |
||||
for (int i = 2; i < inputs[0].dims; i++) { |
||||
inp.push_back(inputs[0].size[i]); |
||||
out.push_back(outputs[0].size[i]); |
||||
} |
||||
if (globalPooling) { |
||||
std::vector<size_t> finalKernel; |
||||
for (int i = 0; i < inp.size(); i++) { |
||||
int idx = isGlobalPooling.size() - inp.size() + i; |
||||
finalKernel.push_back(isGlobalPooling[idx] ? inp[i] : kernel_size[idx]); |
||||
} |
||||
kernel_size = finalKernel; |
||||
} |
||||
|
||||
getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end); |
||||
|
||||
if (inputs[0].dims == 3) |
||||
{ |
||||
// Pool1D
|
||||
kernel_size.assign(1, kernel_size[0]); |
||||
strides.assign(1, strides[0]); |
||||
pads_begin.assign(1, pads_begin[0]); |
||||
pads_end.assign(1, pads_end[0]); |
||||
} |
||||
} |
||||
|
||||
virtual bool supportBackend(int backendId) CV_OVERRIDE |
||||
{ |
||||
if (backendId == DNN_BACKEND_OPENCV) |
||||
{ |
||||
if (kernel_size.size() == 3) |
||||
return preferableTarget == DNN_TARGET_CPU; |
||||
if (kernel_size.size() <= 2) |
||||
return true; |
||||
else |
||||
return false; |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE |
||||
{ |
||||
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>(); |
||||
if (!activ_int8.empty()) |
||||
{ |
||||
return activ_int8->blobs.empty(); |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE |
||||
{ |
||||
CV_TRACE_FUNCTION(); |
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); |
||||
|
||||
std::vector<Mat> inputs, outputs; |
||||
inputs_arr.getMatVector(inputs); |
||||
outputs_arr.getMatVector(outputs); |
||||
|
||||
switch (type) |
||||
{ |
||||
case MAX: |
||||
{ |
||||
CV_Assert_N(inputs.size() == 1, outputs.size() == 1); |
||||
maxPooling(inputs[0], outputs[0]); |
||||
break; |
||||
} |
||||
case AVE: case SUM: |
||||
CV_Assert_N(inputs.size() == 1, outputs.size() == 1); |
||||
avePooling(inputs[0], outputs[0]); |
||||
break; |
||||
default: |
||||
CV_Error(Error::StsNotImplemented, "Not implemented"); |
||||
break; |
||||
} |
||||
} |
||||
|
||||
class PoolingInvoker : public ParallelLoopBody |
||||
{ |
||||
public: |
||||
const Mat* src, *rois; |
||||
Mat *dst; |
||||
int pad_l, pad_t, pad_r, pad_b; |
||||
bool avePoolPaddedArea; |
||||
int nstripes, inpZp, outZp; |
||||
std::vector<int> ofsbuf; |
||||
int poolingType; |
||||
float multiplier; |
||||
float spatialScale; |
||||
|
||||
std::vector<size_t> pads_begin, pads_end; |
||||
std::vector<size_t> kernel_size; |
||||
std::vector<size_t> strides; |
||||
|
||||
PoolingInvoker() : src(0), rois(0), dst(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0), |
||||
avePoolPaddedArea(false), nstripes(0), inpZp(0), outZp(0), |
||||
poolingType(MAX), multiplier(1), spatialScale(0){} |
||||
|
||||
static void run(const Mat& src, const Mat& rois, Mat& dst, |
||||
std::vector<size_t> kernel_size, std::vector<size_t> strides, |
||||
std::vector<size_t> pads_begin, std::vector<size_t> pads_end, |
||||
bool avePoolPaddedArea, int poolingType, float spatialScale, |
||||
float multiplier, int inpZp, int outZp, int nstripes) |
||||
{ |
||||
CV_Assert_N( |
||||
src.isContinuous(), dst.isContinuous(), |
||||
src.type() == CV_8S, src.type() == dst.type(), |
||||
src.dims == 3 || src.dims == 4 || src.dims == 5, dst.dims == 3 || dst.dims == 4 || dst.dims == 5, |
||||
src.size[0] == dst.size[0], src.size[1] == dst.size[1], rois.empty()); |
||||
|
||||
PoolingInvoker p; |
||||
|
||||
bool isPool1D = src.dims == 3; |
||||
bool isPool3D = src.dims == 5; |
||||
|
||||
p.src = &src; |
||||
p.rois = &rois; |
||||
p.dst = &dst; |
||||
|
||||
p.kernel_size = kernel_size; |
||||
p.strides = strides; |
||||
p.pads_begin = pads_begin; |
||||
p.pads_end = pads_end; |
||||
|
||||
p.pad_l = pads_begin.back(); |
||||
p.pad_t = isPool1D ? 0 : pads_begin[pads_begin.size() - 2]; |
||||
p.pad_r = pads_end.back(); |
||||
p.pad_b = isPool1D ? 0 : pads_end[pads_end.size() - 2]; |
||||
|
||||
p.avePoolPaddedArea = avePoolPaddedArea; |
||||
p.nstripes = nstripes; |
||||
p.inpZp = inpZp; |
||||
p.outZp = outZp; |
||||
p.poolingType = poolingType; |
||||
p.spatialScale = spatialScale; |
||||
p.multiplier = multiplier; |
||||
|
||||
int height = isPool1D ? 1 : src.size[src.dims - 2]; |
||||
int width = src.size[src.dims - 1]; |
||||
|
||||
int kernel_d = isPool3D ? kernel_size[0] : 1; |
||||
int kernel_h = isPool1D ? 1 : kernel_size[kernel_size.size() - 2]; |
||||
int kernel_w = kernel_size.back(); |
||||
|
||||
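            // Precompute the flattened input offset of every kernel tap; the SIMD max-pooling
            // path indexes this table instead of re-deriving (d, y, x) offsets in the inner loop.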
            p.ofsbuf.resize(kernel_d * kernel_h * kernel_w);
            for (int i = 0; i < kernel_d; ++i) {
                for (int j = 0; j < kernel_h; ++j) {
                    for (int k = 0; k < kernel_w; ++k) {
                        p.ofsbuf[i * kernel_h * kernel_w + j * kernel_w + k] = width * height * i + width * j + k;
                    }
                }
            }
||||
|
||||
parallel_for_(Range(0, nstripes), p, nstripes); |
||||
} |
||||
|
||||
void operator()(const Range& r) const CV_OVERRIDE |
||||
{ |
||||
int channels = dst->size[1]; |
||||
|
||||
bool isPool3D = src->dims == 5; |
||||
bool isPool2D = src->dims == 4; |
||||
bool isPool1D = src->dims == 3; |
||||
int depth = isPool3D? dst->size[2] : 1; |
||||
int height = isPool1D? 1 : dst->size[dst->dims - 2]; |
||||
int width = dst->size[dst->dims - 1]; |
||||
|
||||
int inp_depth = isPool3D? src->size[2] : 1; |
||||
int inp_height = isPool1D? 1 : src->size[src->dims - 2]; |
||||
int inp_width = src->size[src->dims - 1]; |
||||
|
||||
size_t total = dst->total(); |
||||
size_t stripeSize = (total + nstripes - 1)/nstripes; |
||||
size_t stripeStart = r.start*stripeSize; |
||||
size_t stripeEnd = std::min(r.end*stripeSize, total); |
||||
|
||||
int kernel_d = isPool3D? kernel_size[0] : 1; |
||||
int kernel_h = isPool1D? 1 : kernel_size[kernel_size.size() - 2]; |
||||
int kernel_w = kernel_size.back(); |
||||
|
||||
int stride_d = isPool3D? strides[0] : 0; |
||||
int stride_h = isPool1D? 1 :strides[strides.size() - 2]; |
||||
int stride_w = strides.back(); |
||||
|
||||
#if CV_SIMD128 |
||||
const int* ofsptr = (const int*)&ofsbuf[0]; |
||||
if (poolingType == MAX && !ofsptr) |
||||
CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode"); |
||||
#endif |
||||
|
||||
for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; ) |
||||
{ |
||||
size_t ofs = ofs0; |
||||
int x0 = (int)(ofs % width); |
||||
ofs /= width; |
||||
int y0 = (int)(ofs % height); |
||||
ofs /= height; |
||||
|
||||
int d0 = (int)(ofs % depth); |
||||
ofs /= depth; |
||||
|
||||
int c = (int)(ofs % channels); |
||||
int n = (int)(ofs / channels); |
||||
int ystart, yend; |
||||
int dstart = 0, dend = 1; |
||||
|
||||
const int8_t *srcData = 0; |
||||
int pad_d_begin = (pads_begin.size() == 3) ? pads_begin[0] : 0; |
||||
dstart = d0 * stride_d - pad_d_begin; |
||||
dend = min(dstart + kernel_d, (int)(inp_depth + pads_end[0])); |
||||
|
||||
ystart = y0 * stride_h - pad_t; |
||||
yend = min(ystart + kernel_h, inp_height + pad_b); |
||||
srcData = src->ptr<int8_t>(n, c); |
||||
|
||||
int ddelta = dend - dstart; |
||||
dstart = max(dstart, 0); |
||||
dend = min(dend, inp_depth); |
||||
int ydelta = yend - ystart; |
||||
ystart = max(ystart, 0); |
||||
yend = min(yend, inp_height); |
||||
int8_t *dstData = &dst->ptr<int8_t>(n, c, d0)[y0 * width]; |
||||
|
||||
int delta = std::min((int)(stripeEnd - ofs0), width - x0); |
||||
ofs0 += delta; |
||||
int x1 = x0 + delta; |
||||
|
||||
if( poolingType == MAX ) |
||||
for( ; x0 < x1; x0++ ) |
||||
{ |
||||
int xstart = x0 * stride_w - pad_l; |
||||
int xend = min(xstart + kernel_w, inp_width); |
||||
xstart = max(xstart, 0); |
||||
if (xstart >= xend || ystart >= yend) |
||||
{ |
||||
dstData[x0] = (int8_t)outZp; |
||||
continue; |
||||
} |
||||
#if CV_SIMD128 |
||||
if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width ) |
||||
{ |
||||
v_int8x16 max_val0 = v_setall_s8(-128); |
||||
if( yend - ystart == kernel_h ) |
||||
{ |
||||
const int8_t* srcData1 = srcData + ystart*inp_width + xstart; |
||||
if( stride_w == 1 ) |
||||
for (int k = 0; k < kernel_w*kernel_h; k++) |
||||
{ |
||||
int index = ofsptr[k]; |
||||
v_int8x16 v0 = v_load(srcData1 + index); |
||||
max_val0 = v_max(max_val0, v0); |
||||
} |
||||
else if( stride_w == 2 ) |
||||
for (int k = 0; k < kernel_w*kernel_h; k++) |
||||
{ |
||||
int index = ofsptr[k]; |
||||
v_int8x16 v0, dummy; |
||||
v_load_deinterleave(srcData1 + index, v0, dummy); |
||||
max_val0 = v_max(max_val0, v0); |
||||
} |
||||
else |
||||
for (int k = 0; k < kernel_w*kernel_h; k++) |
||||
{ |
||||
int index = ofsptr[k]; |
||||
v_int8x16 v0(srcData1[index], srcData1[index + stride_w], |
||||
srcData1[index + stride_w*2], srcData1[index + stride_w*3], |
||||
srcData1[index + stride_w*4], srcData1[index + stride_w*5], |
||||
srcData1[index + stride_w*6], srcData1[index + stride_w*7], |
||||
srcData1[index + stride_w*8], srcData1[index + stride_w*9], |
||||
srcData1[index + stride_w*10], srcData1[index + stride_w*11], |
||||
srcData1[index + stride_w*12], srcData1[index + stride_w*13], |
||||
srcData1[index + stride_w*14], srcData1[index + stride_w*15]); |
||||
max_val0 = v_max(max_val0, v0); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
for (int y = ystart; y < yend; ++y) |
||||
{ |
||||
for (int x = xstart; x < xend; ++x) |
||||
{ |
||||
const int index = y * inp_width + x; |
||||
v_int8x16 v0(srcData[index], srcData[index + stride_w], |
||||
srcData[index + stride_w*2], srcData[index + stride_w*3], |
||||
srcData[index + stride_w*4], srcData[index + stride_w*5], |
||||
srcData[index + stride_w*6], srcData[index + stride_w*7], |
||||
srcData[index + stride_w*8], srcData[index + stride_w*9], |
||||
srcData[index + stride_w*10], srcData[index + stride_w*11], |
||||
srcData[index + stride_w*12], srcData[index + stride_w*13], |
||||
srcData[index + stride_w*14], srcData[index + stride_w*15]); |
||||
max_val0 = v_max(max_val0, v0); |
||||
} |
||||
} |
||||
} |
||||
v_store(dstData + x0, max_val0); |
||||
x0 += 15; |
||||
} |
||||
else |
||||
#else |
||||
CV_UNUSED(isPool2D); |
||||
#endif |
||||
if( isPool1D ) |
||||
{ |
||||
const int8_t* first = srcData + xstart; |
||||
const int8_t* last = srcData + xend; |
||||
const int8_t* max_elem = std::max_element(first, last); |
||||
if (max_elem != last) |
||||
dstData[x0] = *max_elem; |
||||
} |
||||
else |
||||
{ |
||||
int8_t max_val = -128; |
||||
for (int d = dstart; d < dend; ++d) { |
||||
for (int y = ystart; y < yend; ++y) { |
||||
for (int x = xstart; x < xend; ++x) { |
||||
const int index = d * inp_width * inp_height + y * inp_width + x; |
||||
int8_t val = srcData[index]; |
||||
max_val = std::max(max_val, val); |
||||
} |
||||
} |
||||
} |
||||
dstData[x0] = max_val; |
||||
} |
||||
} |
||||
else if (poolingType == AVE || poolingType == SUM) |
||||
{ |
||||
for( ; x0 < x1; ++x0) |
||||
{ |
||||
int xstart = x0 * stride_w - pad_l; |
||||
int xend = min(xstart + kernel_w, inp_width + pad_r); |
||||
int xdelta = xend - xstart; |
||||
xstart = max(xstart, 0); |
||||
xend = min(xend, inp_width); |
||||
|
||||
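                // Zero-point handling: inpZp*kernel_area is subtracted up front so every accumulated
                // int8 sample effectively contributes (q - inpZp); when the padded area is counted,
                // the padding cells (which conceptually hold inpZp) are added back first.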
                int real_kernel_area = (dend - dstart) * (yend - ystart) * (xend - xstart);
                int padded_kernel_area = xdelta * ydelta * ddelta;
                int kernel_area = avePoolPaddedArea ? padded_kernel_area : real_kernel_area;

                int bias = (avePoolPaddedArea ? (padded_kernel_area - real_kernel_area) * inpZp : 0)
                           - (inpZp * kernel_area);
                float inv_kernel_area = poolingType == AVE ? multiplier / kernel_area : multiplier;
||||
#if CV_SIMD128 |
||||
if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width ) |
||||
{ |
||||
v_int32x4 sum_val0 = v_setall_s32(bias), sum_val1 = v_setall_s32(bias), |
||||
sum_val2 = v_setall_s32(bias), sum_val3 = v_setall_s32(bias), |
||||
voutzp = v_setall_s32(outZp); |
||||
v_float32x4 ikarea = v_setall_f32(inv_kernel_area); |
||||
|
||||
for (int y = ystart; y < yend; ++y) |
||||
{ |
||||
for (int x = xstart; x < xend; ++x) |
||||
{ |
||||
const int index = y * inp_width + x; |
||||
v_int32x4 v0((int)srcData[index], (int)srcData[index + stride_w], |
||||
(int)srcData[index + stride_w*2], (int)srcData[index + stride_w*3]); |
||||
v_int32x4 v1((int)srcData[index + stride_w*4], (int)srcData[index + stride_w*5], |
||||
(int)srcData[index + stride_w*6], (int)srcData[index + stride_w*7]); |
||||
v_int32x4 v2((int)srcData[index + stride_w*8], (int)srcData[index + stride_w*9], |
||||
(int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]); |
||||
v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13], |
||||
(int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]); |
||||
sum_val0 += v0; |
||||
sum_val1 += v1; |
||||
sum_val2 += v2; |
||||
sum_val3 += v3; |
||||
} |
||||
} |
||||
|
||||
sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp; |
||||
sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp; |
||||
sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp; |
||||
sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp; |
||||
|
||||
v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3))); |
||||
x0 += 15; |
||||
} |
||||
else |
||||
#endif |
||||
if( isPool1D ) |
||||
{ |
||||
const int8_t* first = srcData + xstart; |
||||
const int8_t* last = srcData + xend; |
||||
int sum_val = bias + std::accumulate(first, last, 0); |
||||
dstData[x0] = saturate_cast<int8_t>(outZp + std::round(sum_val*inv_kernel_area)); |
||||
} |
||||
else |
||||
{ |
||||
int sum_val = bias; |
||||
for (int d = dstart; d < dend; ++d) { |
||||
for (int y = ystart; y < yend; ++y) { |
||||
for (int x = xstart; x < xend; ++x) { |
||||
const int index = d * inp_width * inp_height + y * inp_width + x; |
||||
int8_t val = srcData[index]; |
||||
sum_val += (int)val; |
||||
} |
||||
} |
||||
} |
||||
dstData[x0] = saturate_cast<int8_t>(outZp + std::round(sum_val*inv_kernel_area)); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
}; |
||||
|
||||
void maxPooling(Mat &src, Mat &dst) |
||||
{ |
||||
const int nstripes = getNumThreads(); |
||||
Mat rois; |
||||
PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, |
||||
spatialScale, multiplier, input_zp, output_zp, nstripes); |
||||
} |
||||
|
||||
void avePooling(Mat &src, Mat &dst) |
||||
{ |
||||
const int nstripes = getNumThreads(); |
||||
Mat rois; |
||||
PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, |
||||
spatialScale, multiplier, input_zp, output_zp, nstripes); |
||||
} |
||||
|
||||
bool getMemoryShapes(const std::vector<MatShape> &inputs, |
||||
const int requiredOutputs, |
||||
std::vector<MatShape> &outputs, |
||||
std::vector<MatShape> &internals) const CV_OVERRIDE |
||||
{ |
||||
CV_Assert(inputs.size() != 0); |
||||
|
||||
bool isPool1D = inputs[0].size() == 3; |
||||
std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end()); |
||||
std::vector<int> outShape(inputs[0].begin(), inputs[0].begin() + 2); |
||||
|
||||
std::vector<size_t> local_kernel; |
||||
if (globalPooling) { |
||||
for (int i = 0; i < inpShape.size(); i++) { |
||||
int idx = isGlobalPooling.size() - inpShape.size() + i; |
||||
local_kernel.push_back(isGlobalPooling[idx] ? inpShape[i] : kernel_size[idx]); |
||||
} |
||||
} else { |
||||
local_kernel = kernel_size; |
||||
} |
||||
|
||||
if (hasDynamicShapes && !shapesInitialized) |
||||
{ |
||||
//Just copy input shapes for width and height to prevent errors on loading stage
|
||||
for (int i = 0; i < inpShape.size(); i++) |
||||
outShape.push_back(inpShape[i]); |
||||
} |
||||
else if (padMode.empty()) |
||||
{ |
||||
int addedDims = isPool1D? inpShape.size() : local_kernel.size(); |
||||
for (int i = 0; i < addedDims; i++) { |
||||
float dst = (float) (inpShape[i] + pads_begin[i] + pads_end[i] - local_kernel[i]) / strides[i]; |
||||
outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst))); |
||||
} |
||||
|
||||
// If we have padding, ensure that the last pooling starts strictly
|
||||
// inside the image (instead of at the padding); otherwise clip the last.
|
||||
for (int i = 0; i < addedDims; i++) { |
||||
if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) { |
||||
--outShape[2 + i]; |
||||
CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]); |
||||
} |
||||
} |
||||
} |
||||
else { |
||||
getConvPoolOutParams(inpShape, local_kernel, strides, padMode, |
||||
std::vector<size_t>(local_kernel.size(), 1), outShape); |
||||
} |
||||
|
||||
outputs.assign(1, outShape); |
||||
return false; |
||||
} |
||||
|
||||
bool updateMemoryShapes(const std::vector<MatShape> &inputs) CV_OVERRIDE |
||||
{ |
||||
int dims = inputs[0].size(); |
||||
CV_Assert(inputs[0][dims - 1] > 0 && inputs[0][dims - 2] > 0); |
||||
shapesInitialized = true; |
||||
return true; |
||||
} |
||||
|
||||
virtual int64 getFLOPS(const std::vector<MatShape> &inputs, |
||||
const std::vector<MatShape> &outputs) const CV_OVERRIDE |
||||
{ |
||||
CV_UNUSED(inputs); // suppress unused variable warning
|
||||
long flops = 0; |
||||
bool isPool1D = inputs[0].size() == 3; |
||||
size_t karea = std::accumulate(kernel_size.begin(), isPool1D? kernel_size.begin() + 1 : kernel_size.end(), |
||||
1, std::multiplies<size_t>()); |
||||
for(int i = 0; i < outputs.size(); i++) |
||||
{ |
||||
if (type == MAX) |
||||
{ |
||||
if (i%2 == 0) |
||||
flops += total(outputs[i])*karea; |
||||
} |
||||
else |
||||
{ |
||||
flops += total(outputs[i])*(karea + 1); |
||||
} |
||||
} |
||||
return flops; |
||||
} |
||||
private: |
||||
enum Type |
||||
{ |
||||
MAX, |
||||
AVE, |
||||
STOCHASTIC, |
||||
SUM, |
||||
ROI, // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
|
||||
PSROI // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
|
||||
}; |
||||
bool hasDynamicShapes; |
||||
bool shapesInitialized; |
||||
float multiplier; |
||||
}; |
||||
|
||||
Ptr<PoolingLayerInt8> PoolingLayerInt8::create(const LayerParams& params) |
||||
{ |
||||
return Ptr<PoolingLayerInt8>(new PoolingLayerInt8Impl(params)); |
||||
} |
||||
|
||||
} |
||||
} |
@ -0,0 +1,157 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"
#include "layers_common.hpp"

namespace cv
{
namespace dnn
{

class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
{
public:
    QuantizeLayerImpl(const LayerParams& params)
    {
        scale = params.get<float>("scales", 1.0f);
        zeropoint = params.get<int>("zeropoints", 0);
        setParamsFrom(params);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1);
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        return false;
    }

    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
    }

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs, outputs;
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        if (inputs_.depth() == CV_16S)
        {
            UMat inputFp32(shape(inputs[0]), CV_32F);
            convertFp16(inputs[0], inputFp32);
            inputFp32.copyTo(inputs[0]);
        }

        inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint);
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint);
    }
};

class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
{
public:
    DequantizeLayerImpl(const LayerParams& params)
    {
        scale = params.get<float>("scales", 1.0f);
        zeropoint = params.get<int>("zeropoints", 0);
        setParamsFrom(params);
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV;
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1);
        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
        return false;
    }

    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
    }

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs, outputs;
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        UMat outputFp32(shape(outputs[0]), CV_32F);
        inputs[0].convertTo(outputFp32, CV_32F, scale, -(scale*zeropoint));

        if (outputs_.depth() == CV_16S)
            convertFp16(outputFp32, outputs[0]);
        else
            outputFp32.copyTo(outputs[0]);
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        inputs[0].convertTo(outputs[0], CV_32F, scale, -(scale*zeropoint));
    }
};

Ptr<QuantizeLayer> QuantizeLayer::create(const LayerParams& params)
{
    return Ptr<QuantizeLayer>(new QuantizeLayerImpl(params));
}

Ptr<DequantizeLayer> DequantizeLayer::create(const LayerParams& params)
{
    return Ptr<DequantizeLayer>(new DequantizeLayerImpl(params));
}

}
}
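// Illustrative sketch (not part of the patch): the two layers above implement the usual affine
// int8 mapping q = round(x/scale) + zeropoint and x ~= scale*(q - zeropoint), via the same
// cv::Mat::convertTo calls used in forward(). The function name and constants below are
// hypothetical, chosen only for the example.
//
//   #include <opencv2/core.hpp>
//
//   void quantizeDequantizeDemo()
//   {
//       const float scale = 0.05f;   // hypothetical quantization scale
//       const int zeropoint = 3;     // hypothetical zero point
//       cv::Mat x = (cv::Mat_<float>(1, 4) << -1.2f, 0.f, 0.37f, 5.1f);
//
//       cv::Mat q, x_rec;
//       x.convertTo(q, CV_8S, 1.f/scale, zeropoint);            // quantize, as in QuantizeLayer
//       q.convertTo(x_rec, CV_32F, scale, -(scale*zeropoint));  // dequantize, as in DequantizeLayer
//   }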
@ -0,0 +1,211 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "../precomp.hpp" |
||||
#include "layers_common.hpp" |
||||
#include <opencv2/imgproc.hpp> |
||||
#include <opencv2/dnn/shape_utils.hpp> |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace dnn |
||||
{ |
||||
|
||||
class ScaleLayerInt8Impl CV_FINAL : public ScaleLayerInt8 |
||||
{ |
||||
public: |
||||
Mat weights, bias; |
||||
ScaleLayerInt8Impl(const LayerParams& params) |
||||
{ |
||||
setParamsFrom(params); |
||||
hasBias = params.get<bool>("bias_term", false); |
||||
axis = params.get<int>("axis", 1); |
||||
hasWeights = false; |
||||
|
||||
output_sc = params.get<float>("scales"); |
||||
output_zp = params.get<int>("zeropoints"); |
||||
|
||||
DictValue inpSc = params.get("input_scales"); |
||||
DictValue inpZp = params.get("input_zeropoints"); |
||||
|
||||
for (int i = 0; i < inpSc.size(); i++) |
||||
{ |
||||
inp_sc.push_back(inpSc.get<float>(i)); |
||||
inp_zp.push_back(inpZp.get<int>(i)); |
||||
} |
||||
} |
||||
|
||||
bool getMemoryShapes(const std::vector<MatShape> &inputs, |
||||
const int requiredOutputs, |
||||
std::vector<MatShape> &outputs, |
||||
std::vector<MatShape> &internals) const CV_OVERRIDE |
||||
{ |
||||
outputs.assign(1, inputs[0]); |
||||
return true; |
||||
} |
||||
|
||||
virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE |
||||
{ |
||||
std::vector<Mat> inputs; |
||||
inputs_arr.getMatVector(inputs); |
||||
hasWeights = blobs.size() == 2 || (blobs.size() <= 1 && !hasBias); |
||||
CV_Assert((inputs.size() == 2 && blobs.empty()) || blobs.size() == (int)hasWeights + (int)hasBias); |
||||
|
||||
if (!blobs.empty()) |
||||
{ |
||||
Mat w = hasWeights ? blobs[0] : Mat::ones(blobs[0].size(), CV_32F); |
||||
Mat b = hasBias ? blobs.back() : Mat::zeros(blobs.back().size(), CV_32F); |
||||
|
||||
w = w.reshape(1, 1); |
||||
b = b.reshape(1, 1); |
||||
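            // Fold the float scale/shift and the quantization parameters into a single per-channel
            // affine transform applied directly on int8 data:
            //   weights = w * inp_sc / output_sc
            //   bias    = b / output_sc - inp_zp * weights + output_zp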

            w.convertTo(weights, CV_32F, inp_sc[0]/output_sc);
            addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F);
} |
||||
else |
||||
{ |
||||
// initialized during forward()
|
||||
weights = Mat(); bias = Mat(); |
||||
} |
||||
} |
||||
|
||||
virtual bool supportBackend(int backendId) CV_OVERRIDE |
||||
{ |
||||
return backendId == DNN_BACKEND_OPENCV; |
||||
} |
||||
|
||||
bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE |
||||
{ |
||||
Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>(); |
||||
if (!activ_int8.empty()) |
||||
{ |
||||
return activ_int8->blobs.empty(); |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE |
||||
{ |
||||
CV_TRACE_FUNCTION(); |
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); |
||||
|
||||
std::vector<Mat> inputs, outputs; |
||||
inputs_arr.getMatVector(inputs); |
||||
outputs_arr.getMatVector(outputs); |
||||
|
||||
Mat &inpBlob = inputs[0]; |
||||
Mat &outBlob = outputs[0]; |
||||
|
||||
if (blobs.empty()) |
||||
{ |
||||
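            // Two-input case: the scale/shift tensor arrives as a second int8 input rather than as
            // stored blobs, so it is dequantized here and the same folding as in finalize() is
            // redone on every forward call.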
            CV_Assert(inp_sc.size() == 2 && inp_zp.size() == 2);
            Mat inp_dequantized, w, b;
            inputs[1].reshape(1, 1).convertTo(inp_dequantized, CV_32F, inp_sc[1], -(inp_sc[1]*inp_zp[1]));
            w = hasWeights ? inp_dequantized : Mat::ones(inp_dequantized.size(), CV_32F);
            b = hasBias ? inp_dequantized : Mat::zeros(inp_dequantized.size(), CV_32F);

            w.convertTo(weights, CV_32F, inp_sc[0]/output_sc);
            addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F);
||||
} |
||||
|
||||
MatShape inpShape = shape(inpBlob); |
||||
const int numWeights = weights.total(); |
||||
CV_Assert(numWeights != 0); |
||||
CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs"); |
||||
|
||||
int endAxis; |
||||
for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis) |
||||
{ |
||||
if (total(inpShape, axis, endAxis) == numWeights) |
||||
break; |
||||
} |
||||
CV_Assert(total(inpShape, axis, endAxis) == numWeights); |
||||
CV_CheckTypeEQ(inpBlob.type(), CV_8SC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_8SC1, ""); |
||||
|
||||
int numSlices = total(inpShape, 0, axis); |
||||
int8_t* inpData = (int8_t*)inpBlob.data; |
||||
int8_t* outData = (int8_t*)outBlob.data; |
||||
|
||||
if (endAxis != inpBlob.dims) |
||||
{ |
||||
float* weightsData = (float*)weights.data; |
||||
float* biasesData = (float*)bias.data; |
||||
int spatialSize = total(inpShape, endAxis); // spatialSize != 1
|
||||
for (int i = 0; i < numSlices; ++i) |
||||
{ |
||||
for (int j = 0; j < numWeights; ++j) |
||||
{ |
||||
float w = weightsData[j]; |
||||
float b = biasesData[j]; |
||||
Mat inpSlice(1, spatialSize, CV_8S, inpData); |
||||
Mat outSlice(1, spatialSize, CV_8S, outData); |
||||
inpSlice.convertTo(outSlice, CV_8S, w, b); |
||||
inpData += spatialSize; |
||||
outData += spatialSize; |
||||
} |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
for (int i = 0; i < numSlices; ++i) |
||||
{ |
||||
Mat inpSlice(1, numWeights, CV_8S, inpData); |
||||
Mat outSlice(1, numWeights, CV_8S, outData); |
||||
|
||||
multiply(inpSlice, weights, outSlice, 1.0, CV_8S); |
||||
add(outSlice, bias, outSlice, Mat(), CV_8S); |
||||
|
||||
inpData += numWeights; |
||||
outData += numWeights; |
||||
} |
||||
} |
||||
} |
||||
|
||||
void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE |
||||
{ |
||||
scale = (hasWeights && !blobs.empty()) ? blobs[0] : Mat(); |
||||
shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat(); |
||||
} |
||||
|
||||
void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE |
||||
{ |
||||
scale = output_sc; |
||||
zeropoint = output_zp; |
||||
} |
||||
|
||||
virtual int64 getFLOPS(const std::vector<MatShape> &inputs, |
||||
const std::vector<MatShape> &outputs) const CV_OVERRIDE |
||||
{ |
||||
CV_UNUSED(outputs); // suppress unused variable warning
|
||||
long flops = 0; |
||||
for(int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
flops += 2*total(inputs[i]); |
||||
} |
||||
return flops; |
||||
} |
||||
|
||||
private: |
||||
bool hasWeights; |
||||
std::vector<float> inp_sc; |
||||
std::vector<int> inp_zp; |
||||
}; |
||||
|
||||
|
||||
Ptr<ScaleLayerInt8> ScaleLayerInt8::create(const LayerParams& params) |
||||
{ |
||||
return Ptr<ScaleLayerInt8>(new ScaleLayerInt8Impl(params)); |
||||
} |
||||
|
||||
Ptr<Layer> ShiftLayerInt8::create(const LayerParams& params) |
||||
{ |
||||
LayerParams scaleParams = params; |
||||
scaleParams.type = "ScaleInt8"; |
||||
scaleParams.set("bias_term", true); |
||||
scaleParams.set("axis", 0); |
||||
return Ptr<ScaleLayerInt8>(new ScaleLayerInt8Impl(scaleParams)); |
||||
} |
||||
|
||||
} // namespace dnn
|
||||
} // namespace cv
|
@ -0,0 +1,176 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#include "../precomp.hpp" |
||||
#include "layers_common.hpp" |
||||
|
||||
#include <algorithm> |
||||
#include <stdlib.h> |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace dnn |
||||
{ |
||||
|
||||
class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8 |
||||
{ |
||||
public: |
||||
|
||||
SoftMaxLayerInt8Impl(const LayerParams& params) |
||||
{ |
||||
axisRaw = params.get<int>("axis", 1); |
||||
logSoftMax = params.get<bool>("log_softmax", false); |
||||
output_sc = params.get<float>("scales"); |
||||
output_zp = params.get<int>("zeropoints"); |
||||
setParamsFrom(params); |
||||
} |
||||
|
||||
bool getMemoryShapes(const std::vector<MatShape> &inputs, |
||||
const int requiredOutputs, |
||||
std::vector<MatShape> &outputs, |
||||
std::vector<MatShape> &internals) const CV_OVERRIDE |
||||
{ |
||||
bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals); |
||||
MatShape shape = inputs[0]; |
||||
int cAxis = normalize_axis(axisRaw, shape.size()); |
||||
shape[cAxis] = 1; |
||||
internals.assign(1, shape); |
||||
return inplace; |
||||
} |
||||
|
||||
virtual bool supportBackend(int backendId) CV_OVERRIDE |
||||
{ |
||||
return backendId == DNN_BACKEND_OPENCV; |
||||
} |
||||
|
||||
virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE |
||||
{ |
||||
Ptr<DequantizeLayer> dequantize_layer = top.dynamicCast<DequantizeLayer>(); |
||||
return !dequantize_layer.empty() && preferableTarget != DNN_TARGET_OPENCL_FP16; |
||||
} |
||||
|
||||
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE |
||||
{ |
||||
CV_TRACE_FUNCTION(); |
||||
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); |
||||
|
||||
std::vector<Mat> inputs, outputs, internals; |
||||
inputs_arr.getMatVector(inputs); |
||||
outputs_arr.getMatVector(outputs); |
||||
internals_arr.getMatVector(internals); |
||||
|
||||
const Mat &src = inputs[0]; |
||||
Mat &dst = outputs[0]; |
||||
|
||||
int axis = normalize_axis(axisRaw, src.dims); |
||||
size_t outerSize = src.total(0, axis), channels = src.size[axis], |
||||
innerSize = src.total(axis + 1); |
||||
|
||||
CV_Assert(src.type() == CV_8S && (dst.type() == CV_8S || dst.type() == CV_32F)); |
||||
CV_Assert(src.isContinuous() && dst.isContinuous()); |
||||
|
||||
        size_t outerStep = src.total(axis);
        size_t cnStep = src.total(axis + 1);
        const int8_t *srcPtr = src.ptr<int8_t>();
        const float *expPtr = blobs[0].ptr<float>();
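        // blobs[0] is a lookup table with one float per possible int8 input value (index = value + 128);
        // presumably it holds the exponential of the dequantized activation, precomputed when the layer
        // was quantized, so no exp() needs to be evaluated in the inner loops below.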
|
||||
if (dst.type() == CV_32F) |
||||
{ |
||||
float *dstPtr = dst.ptr<float>(); |
||||
for (size_t outerDim = 0; outerDim < outerSize; outerDim++) |
||||
{ |
||||
size_t srcOffset = outerDim * outerStep; |
||||
std::vector<float> expSum(innerSize, 0.f); |
||||
|
||||
// sum exp along axis
|
||||
for (size_t cnDim = 0; cnDim < channels; cnDim++) |
||||
{ |
||||
const int offset = srcOffset + cnDim * cnStep; |
||||
for (size_t i = 0; i < innerSize; i++) |
||||
expSum[i] += expPtr[srcPtr[offset + i] + 128]; |
||||
} |
||||
|
||||
// divide by computed sum
|
||||
for (size_t cnDim = 0; cnDim < channels; cnDim++) |
||||
{ |
||||
const int offset = srcOffset + cnDim * cnStep; |
||||
for (size_t i = 0; i < innerSize; i++) |
||||
dstPtr[offset + i] = expPtr[srcPtr[offset + i] + 128]/expSum[i]; |
||||
} |
||||
|
||||
if (logSoftMax) |
||||
{ |
||||
for (size_t cnDim = 0; cnDim < channels; cnDim++) |
||||
{ |
||||
const int offset = srcOffset + cnDim * cnStep; |
||||
for (size_t i = 0; i < innerSize; i++) |
||||
dstPtr[offset + i] = log(dstPtr[offset + i]); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
const float inv_scale = 1.f/output_sc; |
||||
int8_t *dstPtr = dst.ptr<int8_t>(); |
||||
for (size_t outerDim = 0; outerDim < outerSize; outerDim++) |
||||
{ |
||||
size_t srcOffset = outerDim * outerStep; |
||||
std::vector<float> expSum(innerSize, 0.f); |
||||
|
||||
// sum exp along axis
|
||||
for (size_t cnDim = 0; cnDim < channels; cnDim++) |
||||
{ |
||||
const int offset = srcOffset + cnDim * cnStep; |
||||
for (size_t i = 0; i < innerSize; i++) |
||||
expSum[i] += expPtr[srcPtr[offset + i] + 128]; |
||||
} |
||||
|
||||
// divide by computed sum and quantize to int8
|
||||
if (logSoftMax) |
||||
{ |
||||
for (size_t cnDim = 0; cnDim < channels; cnDim++) |
||||
{ |
||||
const int offset = srcOffset + cnDim * cnStep; |
||||
for (size_t i = 0; i < innerSize; i++) |
||||
dstPtr[offset + i] = saturate_cast<int8_t>(output_zp + std::round(inv_scale*log(expPtr[srcPtr[offset + i] + 128]/expSum[i]))); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
for (size_t cnDim = 0; cnDim < channels; cnDim++) |
||||
{ |
||||
const int offset = srcOffset + cnDim * cnStep; |
||||
for (size_t i = 0; i < innerSize; i++) |
||||
dstPtr[offset + i] = saturate_cast<int8_t>(output_zp + std::round(inv_scale*(expPtr[srcPtr[offset + i] + 128]/expSum[i]))); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
int64 getFLOPS(const std::vector<MatShape> &inputs, |
||||
const std::vector<MatShape> &outputs) const CV_OVERRIDE |
||||
{ |
||||
CV_UNUSED(outputs); // suppress unused variable warning
|
||||
int64 flops = 0; |
||||
|
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
flops += 4*total(inputs[i]); |
||||
} |
||||
|
||||
return flops; |
||||
} |
||||
|
||||
int axisRaw; |
||||
}; |
||||
|
||||
Ptr<SoftmaxLayerInt8> SoftmaxLayerInt8::create(const LayerParams& params) |
||||
{ |
||||
return Ptr<SoftmaxLayerInt8>(new SoftMaxLayerInt8Impl(params)); |
||||
} |
||||
|
||||
} |
||||
} |
File diff suppressed because it is too large