mirror of https://github.com/opencv/opencv.git
Merge pull request #21910 from zihaomu:fast_conv_ARM
DNN: Accelerating convolution

* Fast convolution for ARM and X86, using universal intrinsics.
* Improve code style.
* Fix errors.
* Improve the license headers.
* Optimize memory allocation and adjust the threshold.
* Change FasterRCNN_vgg16 to 2GB memory.

pull/22135/head
parent b91f173680
commit 59b870a87a
11 changed files with 3286 additions and 12 deletions
@@ -0,0 +1,385 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
// Here is the original license:
/*
    This file is a part of ficus language project.
    See ficus/LICENSE for the licensing terms
*/

#include "../../precomp.hpp"
#include "fast_convolution.hpp"

namespace cv { namespace dnn {

static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
                           float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
                           int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                           int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
#ifdef CV_SIMD128
    v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);

    v_float32x4 w0 = v_setall_f32(0.f), w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0;
    if (useSIMD)
    {
        vbias = v_setall_f32(biasval);
        if (is3x3)
        {
            w0 = v_setall_f32(weights[0]);
            w1 = v_setall_f32(weights[1]);
            w2 = v_setall_f32(weights[2]);
            w3 = v_setall_f32(weights[3]);
            w4 = v_setall_f32(weights[4]);
            w5 = v_setall_f32(weights[5]);
            w6 = v_setall_f32(weights[6]);
            w7 = v_setall_f32(weights[7]);
            w8 = v_setall_f32(weights[8]);
        }
    }
#endif
    int dy0 = 1;
    for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
    {
#ifdef CV_SIMD128
        dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1 ? 3 : 1;
#endif
        int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
        int yi_ = y0 * stride_y - pad_top;

        for (;;)
        {
            float s_0, s_1, s_2;
            if (dy0 == 3)
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];

                        if ((unsigned) xi < (unsigned) Wi)
                        {
                            s_0 += inptr[yi * Wi + xi] * w;
                            s_1 += inptr[(yi + 1) * Wi + xi] * w;
                            s_2 += inptr[(yi + 2) * Wi + xi] * w;
                        }
                    }
                    s_0 = std::min(std::max(s_0, minval), maxval);
                    s_1 = std::min(std::max(s_1, minval), maxval);
                    s_2 = std::min(std::max(s_2, minval), maxval);
                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];
                        if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi))
                            s_0 += inptr[yi * Wi + xi] * w;
                    }
                    s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            if (x0 == W0)
                break;
            x1 = inner_xright;
#ifdef CV_SIMD128
            if (useSIMD)
            {
                if (is3x3)
                {
                    if (dy0 == 3)
                    {
                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;

                            v_float32x4 s0, s1, s2;
                            v_float32x4 x00 = v_load(inptr_xi);
                            v_float32x4 x01 = v_load(inptr_xi + 1);
                            v_float32x4 x02 = v_load(inptr_xi + 2);

                            v_float32x4 x10 = v_load(inptr_xi + Wi);
                            v_float32x4 x11 = v_load(inptr_xi + Wi + 1);
                            v_float32x4 x12 = v_load(inptr_xi + Wi + 2);

                            v_float32x4 x20 = v_load(inptr_xi + Wi * 2);
                            v_float32x4 x21 = v_load(inptr_xi + Wi * 2 + 1);
                            v_float32x4 x22 = v_load(inptr_xi + Wi * 2 + 2);

                            v_float32x4 x30 = v_load(inptr_xi + Wi * 3);
                            v_float32x4 x31 = v_load(inptr_xi + Wi * 3 + 1);
                            v_float32x4 x32 = v_load(inptr_xi + Wi * 3 + 2);

                            v_float32x4 x40 = v_load(inptr_xi + Wi * 4);
                            v_float32x4 x41 = v_load(inptr_xi + Wi * 4 + 1);
                            v_float32x4 x42 = v_load(inptr_xi + Wi * 4 + 2);

                            s0 = v_fma(x00, w0, vbias);
                            s1 = v_fma(x10, w0, vbias);
                            s2 = v_fma(x20, w0, vbias);

                            s0 = v_fma(x01, w1, s0);
                            s1 = v_fma(x11, w1, s1);
                            s2 = v_fma(x21, w1, s2);

                            s0 = v_fma(x02, w2, s0);
                            s1 = v_fma(x12, w2, s1);
                            s2 = v_fma(x22, w2, s2);

                            s0 = v_fma(x10, w3, s0);
                            s1 = v_fma(x20, w3, s1);
                            s2 = v_fma(x30, w3, s2);

                            s0 = v_fma(x11, w4, s0);
                            s1 = v_fma(x21, w4, s1);
                            s2 = v_fma(x31, w4, s2);

                            s0 = v_fma(x12, w5, s0);
                            s1 = v_fma(x22, w5, s1);
                            s2 = v_fma(x32, w5, s2);

                            s0 = v_fma(x20, w6, s0);
                            s1 = v_fma(x30, w6, s1);
                            s2 = v_fma(x40, w6, s2);

                            s0 = v_fma(x21, w7, s0);
                            s1 = v_fma(x31, w7, s1);
                            s2 = v_fma(x41, w7, s2);

                            s0 = v_fma(x22, w8, s0);
                            s1 = v_fma(x32, w8, s1);
                            s2 = v_fma(x42, w8, s2);

                            if (ifMinMaxAct)
                            {
                                s0 = v_min(v_max(s0, vminval), vmaxval);
                                s1 = v_min(v_max(s1, vminval), vmaxval);
                                s2 = v_min(v_max(s2, vminval), vmaxval);
                            }

                            v_store(outptr + x0, s0);
                            v_store(outptr + W0 + x0, s1);
                            v_store(outptr + W0 * 2 + x0, s2);
                        }
                    }
                    else
                    {
                        for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
                            v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias);
                            v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1;
                            v_float32x4 s2 = v_load(inptr_xi + ofstab[2]) * w2;

                            s0 = v_fma(v_load(inptr_xi + ofstab[3]), w3, s0);
                            s1 = v_fma(v_load(inptr_xi + ofstab[4]), w4, s1);
                            s2 = v_fma(v_load(inptr_xi + ofstab[5]), w5, s2);

                            s0 = v_fma(v_load(inptr_xi + ofstab[6]), w6, s0);
                            s1 = v_fma(v_load(inptr_xi + ofstab[7]), w7, s1);
                            s2 = v_fma(v_load(inptr_xi + ofstab[8]), w8, s2);

                            s0 = s0 + s1 + s2;
                            if (ifMinMaxAct)
                                s0 = v_min(v_max(s0, vminval), vmaxval);
                            v_store(outptr + x0, s0);
                        }
                    }
                }
                else
                {
                    for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
                    {
                        int xi_ = x0 * stride_x - pad_left, k = 0;
                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
                        v_float32x4 s0 = vbias;
                        for (; k <= ksize - 4; k += 4)
                        {
                            v_float32x4 v0 = v_load(inptr_xi + ofstab[k]);
                            v_float32x4 v1 = v_load(inptr_xi + ofstab[k + 1]);
                            v_float32x4 v2 = v_load(inptr_xi + ofstab[k + 2]);
                            v_float32x4 v3 = v_load(inptr_xi + ofstab[k + 3]);

                            v_float32x4 ww0 = v_setall_f32(weights[k]);
                            v_float32x4 ww1 = v_setall_f32(weights[k + 1]);
                            v_float32x4 ww2 = v_setall_f32(weights[k + 2]);
                            v_float32x4 ww3 = v_setall_f32(weights[k + 3]);

                            s0 = v_fma(v0, ww0, s0);
                            s0 = v_fma(v1, ww1, s0);
                            s0 = v_fma(v2, ww2, s0);
                            s0 = v_fma(v3, ww3, s0);
                        }
                        for (; k < ksize; k++)
                            s0 = v_fma(v_load(inptr_xi + ofstab[k]),
                                       v_setall_f32(weights[k]), s0);
                        if (ifMinMaxAct)
                            s0 = v_min(v_max(s0, vminval), vmaxval);
                        v_store(outptr + x0, s0);
                    }
                }
            }
#endif
            if (dy0 == 3)
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int inp_ofs = ofstab[k];
                        float w = weights[k];
                        s_0 += inptr_xi[inp_ofs] * w;
                        s_1 += inptr_xi[inp_ofs + Wi] * w;
                        s_2 += inptr_xi[inp_ofs + Wi * 2] * w;
                    }
                    if (ifMinMaxAct)
                    {
                        s_0 = std::min(std::max(s_0, minval), maxval);
                        s_1 = std::min(std::max(s_1, minval), maxval);
                        s_2 = std::min(std::max(s_2, minval), maxval);
                    }

                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        s_0 += inptr_xi[ofstab[k]] * weights[k];
                    }

                    if (ifMinMaxAct)
                        s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            x1 = W0;
        }
    }
}

void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
                  ActivationLayer* activ, bool ifMinMaxAct)
{
    Mat input = _input.getMat();
    Mat output = _output.getMat();
    MatShape inputShape = shape(input);
    MatShape outputShape = shape(output);
    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);

    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
    int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
    int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;

    const size_t inp_planesize = (size_t) Hi * Wi;
    const size_t out_planesize = (size_t) H0 * W0;

    CV_Assert(ngroups > 1 && ngroups == K && ngroups == C);

    int stride_y = conv->stride_y, stride_x = conv->stride_x;
    int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;

    int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
    int pad_left = conv->pad_left, pad_right = conv->pad_right;

    int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;

    const float *inp = input.ptr<float>();
    float *out = output.ptr<float>();

    std::vector<int> ofstab_(3 * padded_ksize, 0);
    int *ofstab = ofstab_.data();
    int *yxtab = ofstab + padded_ksize;

    for (int k = 0; k < padded_ksize; k++)
    {
        int y = k < ksize ? k / Wk : 0;
        int x = k < ksize ? k % Wk : 0;
        int dy = y * dilation_y, dx = x * dilation_x;
        yxtab[k * 2] = dy;
        yxtab[k * 2 + 1] = dx;
        ofstab[k] = dy * Wi + dx;
    }

    const float *weights0 = conv->weightsBuf.data(), *bias = conv->biasBuf.data();
    int inner_ytop = (pad_top + stride_y - 1) / stride_y, inner_ybottom = 3;
    int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4;

    CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0));

    inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x;
    inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi;
    inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y;
    inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi;

    if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom)
    {
        inner_xleft = W0;
        inner_ytop = H0;
    }

    inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0;

    bool useSIMD = stride_x == 1 && inner_xleft < W0;
    bool is3x3 = Hk == 3 && Wk == 3;

    parallel_for_(Range(0, N * C), [&](const Range &r0) {
        for (int nc = r0.start; nc < r0.end; nc++)
        {
            int c = nc % C;
            const float *inptr = inp + inp_planesize * nc;
            float *outptr0 = out + out_planesize * nc;

            float biasval = bias[c];
            const float *weights = weights0 + c * padded_ksize;

#if CV_TRY_AVX2
            if (conv->useAVX2)
                opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
                                              pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
                                              inner_ybottom, ifMinMaxAct, useSIMD, is3x3);
            else
#endif
            depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize,
                           pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop,
                           inner_ybottom, ifMinMaxAct, useSIMD, is3x3);

            if (activ)
                activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c + 1);
        }
    });
}

}}  // namespace cv::dnn
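The depthwise kernel above splits each output row into a bounds-checked border ([0, inner_xleft) and [inner_xright, W0)) and a check-free interior that the SIMD path handles. The following standalone sketch (not part of the patch; the shape values are illustrative) traces the interior-bound formulas from runDepthwise for one concrete configuration:

    // Standalone sketch: how the no-bounds-check interval is derived for a
    // 3x3, stride-1, pad-1 depthwise layer on a width-5 input plane.
    #include <cstdio>

    int main()
    {
        int Wi = 5, Wk = 3, pad_left = 1, stride_x = 1, dilation_x = 1;
        // First output x whose leftmost tap (x*stride - pad) is inside the input:
        int inner_xleft = (pad_left + stride_x - 1) / stride_x;              // -> 1
        // One past the last output x whose rightmost tap is inside the input:
        int inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x;
        inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi;
        printf("no-check interval: [%d, %d)\n", inner_xleft, inner_xright);  // [1, 4)
        return 0;
    }

For a 3x3/stride-1/pad-1 layer the interior is everything but one column on each side, so almost the whole row takes the vector path.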
@@ -0,0 +1,361 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../../precomp.hpp"
#include "fast_convolution.hpp"

namespace cv {
namespace opt_AVX2
{
#if CV_TRY_AVX2
void convBlock_AVX2(int k, const float *a, const float *b,
                    float *c, int ldc, const float *bias,
                    float minval, float maxval, bool ifActiv)
{
#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24
    __m256 vminval = _mm256_set1_ps(minval), vmaxval = _mm256_set1_ps(maxval);
    __m256 c0 = _mm256_set1_ps(bias[0]), c1 = c0, c2 = c0;
    __m256 c3 = _mm256_set1_ps(bias[1]), c4 = c3, c5 = c3;
    __m256 c6 = _mm256_set1_ps(bias[2]), c7 = c6, c8 = c6;
    __m256 c9 = _mm256_set1_ps(bias[3]), c10 = c9, c11 = c9;

    __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps();
    __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps();

    for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
    {
        a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]);
        b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16);

        c0 = _mm256_fmadd_ps(b0, a0, c0);
        c1 = _mm256_fmadd_ps(b1, a0, c1);
        c2 = _mm256_fmadd_ps(b2, a0, c2);

        c3 = _mm256_fmadd_ps(b0, a1, c3);
        a0 = _mm256_set1_ps(a[2]);
        c4 = _mm256_fmadd_ps(b1, a1, c4);
        c5 = _mm256_fmadd_ps(b2, a1, c5);

        c6 = _mm256_fmadd_ps(b0, a0, c6);
        a1 = _mm256_set1_ps(a[3]);
        c7 = _mm256_fmadd_ps(b1, a0, c7);
        c8 = _mm256_fmadd_ps(b2, a0, c8);

        c9 = _mm256_fmadd_ps(b0, a1, c9);
        c10 = _mm256_fmadd_ps(b1, a1, c10);
        c11 = _mm256_fmadd_ps(b2, a1, c11);
    }

    if (ifActiv)
    {
        c0 = _mm256_min_ps(_mm256_max_ps(c0, vminval), vmaxval);
        c1 = _mm256_min_ps(_mm256_max_ps(c1, vminval), vmaxval);
        c2 = _mm256_min_ps(_mm256_max_ps(c2, vminval), vmaxval);
        c3 = _mm256_min_ps(_mm256_max_ps(c3, vminval), vmaxval);
        c4 = _mm256_min_ps(_mm256_max_ps(c4, vminval), vmaxval);
        c5 = _mm256_min_ps(_mm256_max_ps(c5, vminval), vmaxval);
        c6 = _mm256_min_ps(_mm256_max_ps(c6, vminval), vmaxval);
        c7 = _mm256_min_ps(_mm256_max_ps(c7, vminval), vmaxval);
        c8 = _mm256_min_ps(_mm256_max_ps(c8, vminval), vmaxval);
        c9 = _mm256_min_ps(_mm256_max_ps(c9, vminval), vmaxval);
        c10 = _mm256_min_ps(_mm256_max_ps(c10, vminval), vmaxval);
        c11 = _mm256_min_ps(_mm256_max_ps(c11, vminval), vmaxval);
    }

    _mm256_storeu_ps(c, c0); _mm256_storeu_ps(c + 8, c1); _mm256_storeu_ps(c + 16, c2);
    _mm256_storeu_ps(c + ldc, c3); _mm256_storeu_ps(c + ldc + 8, c4); _mm256_storeu_ps(c + ldc + 16, c5);
    _mm256_storeu_ps(c + ldc*2, c6); _mm256_storeu_ps(c + ldc*2 + 8, c7); _mm256_storeu_ps(c + ldc*2 + 16, c8);
    _mm256_storeu_ps(c + ldc*3, c9); _mm256_storeu_ps(c + ldc*3 + 8, c10); _mm256_storeu_ps(c + ldc*3 + 16, c11);
    _mm256_zeroupper();
#else
#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_AVX2."
#endif
}

void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
                         float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
                         int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                         int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
    const int VECSZ = 8;
    __m256 vminval = _mm256_set1_ps(minval);
    __m256 vmaxval = _mm256_set1_ps(maxval);

    __m256 w0 = _mm256_setzero_ps(),
           w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0;

    if (useSIMD)
    {
        vbias = _mm256_set1_ps(biasval);
        if (is3x3)
        {
            w0 = _mm256_set1_ps(weights[0]);
            w1 = _mm256_set1_ps(weights[1]);
            w2 = _mm256_set1_ps(weights[2]);
            w3 = _mm256_set1_ps(weights[3]);
            w4 = _mm256_set1_ps(weights[4]);
            w5 = _mm256_set1_ps(weights[5]);
            w6 = _mm256_set1_ps(weights[6]);
            w7 = _mm256_set1_ps(weights[7]);
            w8 = _mm256_set1_ps(weights[8]);
        }
    }

    int dy0 = 1;
    for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0)
    {
        dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1 ? 3 : 1;

        int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0;
        int yi_ = y0 * stride_y - pad_top;

        for (;;)
        {
            float s_0, s_1, s_2;
            if (dy0 == 3)
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];

                        if ((unsigned) xi < (unsigned) Wi)
                        {
                            s_0 += inptr[yi * Wi + xi] * w;
                            s_1 += inptr[(yi + 1) * Wi + xi] * w;
                            s_2 += inptr[(yi + 2) * Wi + xi] * w;
                        }
                    }
                    if (ifMinMaxAct)
                    {
                        s_0 = std::min(std::max(s_0, minval), maxval);
                        s_1 = std::min(std::max(s_1, minval), maxval);
                        s_2 = std::min(std::max(s_2, minval), maxval);
                    }

                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int dy = yxtab[k * 2];
                        int yi = yi_ + dy;
                        int xi = xi_ + yxtab[k * 2 + 1];
                        float w = weights[k];
                        if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi))
                            s_0 += inptr[yi * Wi + xi] * w;
                    }
                    if (ifMinMaxAct)
                        s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            if (x0 == W0)
                break;
            x1 = inner_xright;

            if (useSIMD)
            {
                if (is3x3)
                {
                    if (dy0 == 3)
                    {
                        for (; x0 <= x1 - VECSZ; x0 += VECSZ)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;

                            __m256 s0, s1, s2;
                            __m256 x00 = _mm256_loadu_ps(inptr_xi);
                            __m256 x01 = _mm256_loadu_ps(inptr_xi + 1);
                            __m256 x02 = _mm256_loadu_ps(inptr_xi + 2);

                            __m256 x10 = _mm256_loadu_ps(inptr_xi + Wi);
                            __m256 x11 = _mm256_loadu_ps(inptr_xi + Wi + 1);
                            __m256 x12 = _mm256_loadu_ps(inptr_xi + Wi + 2);

                            __m256 x20 = _mm256_loadu_ps(inptr_xi + Wi * 2);
                            __m256 x21 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 1);
                            __m256 x22 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 2);

                            __m256 x30 = _mm256_loadu_ps(inptr_xi + Wi * 3);
                            __m256 x31 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 1);
                            __m256 x32 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 2);

                            __m256 x40 = _mm256_loadu_ps(inptr_xi + Wi * 4);
                            __m256 x41 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 1);
                            __m256 x42 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 2);

                            s0 = _mm256_fmadd_ps(x00, w0, vbias);
                            s1 = _mm256_fmadd_ps(x10, w0, vbias);
                            s2 = _mm256_fmadd_ps(x20, w0, vbias);

                            s0 = _mm256_fmadd_ps(x01, w1, s0);
                            s1 = _mm256_fmadd_ps(x11, w1, s1);
                            s2 = _mm256_fmadd_ps(x21, w1, s2);

                            s0 = _mm256_fmadd_ps(x02, w2, s0);
                            s1 = _mm256_fmadd_ps(x12, w2, s1);
                            s2 = _mm256_fmadd_ps(x22, w2, s2);

                            s0 = _mm256_fmadd_ps(x10, w3, s0);
                            s1 = _mm256_fmadd_ps(x20, w3, s1);
                            s2 = _mm256_fmadd_ps(x30, w3, s2);

                            s0 = _mm256_fmadd_ps(x11, w4, s0);
                            s1 = _mm256_fmadd_ps(x21, w4, s1);
                            s2 = _mm256_fmadd_ps(x31, w4, s2);

                            s0 = _mm256_fmadd_ps(x12, w5, s0);
                            s1 = _mm256_fmadd_ps(x22, w5, s1);
                            s2 = _mm256_fmadd_ps(x32, w5, s2);

                            s0 = _mm256_fmadd_ps(x20, w6, s0);
                            s1 = _mm256_fmadd_ps(x30, w6, s1);
                            s2 = _mm256_fmadd_ps(x40, w6, s2);

                            s0 = _mm256_fmadd_ps(x21, w7, s0);
                            s1 = _mm256_fmadd_ps(x31, w7, s1);
                            s2 = _mm256_fmadd_ps(x41, w7, s2);

                            s0 = _mm256_fmadd_ps(x22, w8, s0);
                            s1 = _mm256_fmadd_ps(x32, w8, s1);
                            s2 = _mm256_fmadd_ps(x42, w8, s2);

                            if (ifMinMaxAct)
                            {
                                s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval);
                                s1 = _mm256_min_ps(_mm256_max_ps(s1, vminval), vmaxval);
                                s2 = _mm256_min_ps(_mm256_max_ps(s2, vminval), vmaxval);
                            }

                            _mm256_storeu_ps(outptr + x0, s0);
                            _mm256_storeu_ps(outptr + W0 + x0, s1);
                            _mm256_storeu_ps(outptr + W0 * 2 + x0, s2);
                        }
                    }
                    else
                    {
                        for (; x0 <= x1 - VECSZ; x0 += VECSZ)
                        {
                            int xi_ = x0 * stride_x - pad_left;
                            const float *inptr_xi = inptr + Wi * yi_ + xi_;
                            __m256 s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[0]), w0, vbias);
                            __m256 s1 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[1]), w1);
                            __m256 s2 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[2]), w2);

                            s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[3]), w3, s0);
                            s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[4]), w4, s1);
                            s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[5]), w5, s2);

                            s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[6]), w6, s0);
                            s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[7]), w7, s1);
                            s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[8]), w8, s2);

                            s0 = _mm256_add_ps(_mm256_add_ps(s0, s1), s2);

                            if (ifMinMaxAct)
                                s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval);
                            _mm256_storeu_ps(outptr + x0, s0);
                        }
                    }
                }
                else
                {
                    for (; x0 <= x1 - VECSZ; x0 += VECSZ)
                    {
                        int xi_ = x0 * stride_x - pad_left, k = 0;
                        const float *inptr_xi = inptr + Wi * yi_ + xi_;
                        __m256 s0 = vbias;
                        for (; k <= ksize - 4; k += 4)
                        {
                            __m256 v0 = _mm256_loadu_ps(inptr_xi + ofstab[k]);
                            __m256 v1 = _mm256_loadu_ps(inptr_xi + ofstab[k + 1]);
                            __m256 v2 = _mm256_loadu_ps(inptr_xi + ofstab[k + 2]);
                            __m256 v3 = _mm256_loadu_ps(inptr_xi + ofstab[k + 3]);

                            __m256 ww0 = _mm256_set1_ps(weights[k]);
                            __m256 ww1 = _mm256_set1_ps(weights[k + 1]);
                            __m256 ww2 = _mm256_set1_ps(weights[k + 2]);
                            __m256 ww3 = _mm256_set1_ps(weights[k + 3]);

                            s0 = _mm256_fmadd_ps(v0, ww0, s0);
                            s0 = _mm256_fmadd_ps(v1, ww1, s0);
                            s0 = _mm256_fmadd_ps(v2, ww2, s0);
                            s0 = _mm256_fmadd_ps(v3, ww3, s0);
                        }
                        for (; k < ksize; k++)
                            s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[k]),
                                                 _mm256_set1_ps(weights[k]), s0);

                        if (ifMinMaxAct)
                            s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval);
                        _mm256_storeu_ps(outptr + x0, s0);
                    }
                }
            }

            if (dy0 == 3)
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = s_1 = s_2 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        int inp_ofs = ofstab[k];
                        float w = weights[k];
                        s_0 += inptr_xi[inp_ofs] * w;
                        s_1 += inptr_xi[inp_ofs + Wi] * w;
                        s_2 += inptr_xi[inp_ofs + Wi * 2] * w;
                    }
                    if (ifMinMaxAct)
                    {
                        s_0 = std::min(std::max(s_0, minval), maxval);
                        s_1 = std::min(std::max(s_1, minval), maxval);
                        s_2 = std::min(std::max(s_2, minval), maxval);
                    }

                    outptr[x0] = s_0;
                    outptr[x0 + W0] = s_1;
                    outptr[x0 + W0 * 2] = s_2;
                }
            }
            else
            {
                for (; x0 < x1; x0++)
                {
                    int xi_ = x0 * stride_x - pad_left;
                    const float *inptr_xi = inptr + Wi * yi_ + xi_;
                    s_0 = biasval;
                    for (int k = 0; k < ksize; k++)
                    {
                        s_0 += inptr_xi[ofstab[k]] * weights[k];
                    }
                    if (ifMinMaxAct)
                        s_0 = std::min(std::max(s_0, minval), maxval);
                    outptr[x0] = s_0;
                }
            }
            x1 = W0;
        }
    }
}
#endif
}  // namespace opt_AVX2
}  // namespace cv
@@ -0,0 +1,694 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx).
// Here is the original license:
/*
    This file is a part of ficus language project.
    See ficus/LICENSE for the licensing terms
*/

#include "../../precomp.hpp"
#include "fast_convolution.hpp"
#include "fast_convolution.simd.hpp"

namespace cv { namespace dnn {

Ptr<FastConv2d> initFastConv2d(
        int ngroups,
        int K, int C, int Hk, int Wk,
        int stride_x, int stride_y,
        int dilation_x, int dilation_y,
        const std::vector<size_t>& pads_begin,
        const std::vector<size_t>& pads_end,
        float* srcWeights,
        float* srcBias)
{
    Ptr<FastConv2d> conv = makePtr<FastConv2d>();

    CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0);
    CV_Assert(Hk > 0 && Wk > 0);
    CV_Assert(stride_y > 0 && stride_x > 0);
    CV_Assert(dilation_y > 0 && dilation_x > 0);

    conv->K = K; conv->C = C; conv->Hk = Hk; conv->Wk = Wk;  // [K, iC, kH, kW]
    conv->stride_y = stride_y;
    conv->stride_x = stride_x;
    conv->dilation_y = dilation_y;
    conv->dilation_x = dilation_x;

    conv->ngroups = ngroups;
    conv->pad_top = pads_begin[0];
    conv->pad_bottom = pads_end[0];
    conv->pad_left = pads_begin[1];
    conv->pad_right = pads_end[1];

    // Store the bias; append some zeros so that we can always read
    // FAST_CONV_MR elements starting from any valid index.
    {
        int k = 0, nbias = K + FAST_CONV_MR - 1;
        conv->biasBuf.reserve(nbias);
        float* biasBufPtr = conv->biasBuf.data();
        for (; k < K; k++)
            biasBufPtr[k] = srcBias ? srcBias[k] : 0.f;
        for (; k < nbias; k++)
            biasBufPtr[k] = 0.f;
    }

#if CV_NEON  // For now, the Winograd branch is ARM-only.
    if (ngroups == 1 && Hk == 3 && Wk == 3 && stride_x == 1 && stride_y == 1 && dilation_x == 1 && dilation_y == 1
        && K >= 16 && C >= 16)
        conv->ifWinograd63 = true;
#else
    conv->ifWinograd63 = false;
#endif

    if (ngroups > 1 && ngroups == K && ngroups == C)
    {
        // For depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
        // but add some padding to make the weights array layout more SIMD-friendly.
        int ksize = Hk*Wk;
        int padded_ksize = ((ksize + FAST_VEC_NLANES-1)/FAST_VEC_NLANES)*FAST_VEC_NLANES;  // round the kernel size up to the vector width
        int nweights = C*padded_ksize;
        conv->weightsBuf.reserve(nweights);
        float* weightsBufPtr = conv->weightsBuf.data();
        memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));
        for (int c = 0; c < C; c++)
        {
            for (int k = 0; k < ksize; k++)
                weightsBufPtr[c*padded_ksize + k] = srcWeights[c*ksize + k];
        }
    }
    else
    {
        // The weights are packed as an
        // ngroups x (ceil((K/ngroups)/FAST_CONV_MR)*FAST_CONV_MR) x (Cg*Hk*Wk) x FAST_CONV_MR tensor.
        int Kg = K/ngroups, Cg = max(C/ngroups, 1);
        int Kg_aligned = ((Kg + FAST_CONV_MR - 1)/FAST_CONV_MR)*FAST_CONV_MR;
        size_t nweights = ngroups*Kg_aligned*Cg*Hk*Wk;
        conv->weightsBuf.reserve(nweights);
        float* weightsBufPtr = conv->weightsBuf.data();
        memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0]));
        float* packed_wptr = weightsBufPtr;

        // Pack the weights.
        for (int g = 0; g < ngroups; g++)
        {
            for (int k0 = 0; k0 < Kg_aligned; k0 += FAST_CONV_MR)
            {
                int dk = Kg - k0 < FAST_CONV_MR ? Kg - k0 : FAST_CONV_MR;
                for (int c = 0; c < Cg; c++)
                {
                    for (int yx = 0; yx < Hk*Wk; yx++, packed_wptr += FAST_CONV_MR)
                    {
                        const float* wptr = srcWeights + ((g*Kg + k0)*Cg + c)*Hk*Wk + yx;
                        int k = 0;
                        for (; k < dk; k++, wptr += Cg*Hk*Wk)
                            packed_wptr[k] = *wptr;
                        for (; k < FAST_CONV_MR; k++)
                            packed_wptr[k] = 0.f;
                    }
                }
            }
        }

        // Prepare the weights for Winograd F(6x6, 3x3).
        if (conv->ifWinograd63)
        {
            initWinograd63(conv, srcWeights, K, C);
        }
    }
    return conv;
}
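To make the packed layout above concrete, here is a hypothetical helper (not in the patch) that maps weight element (g, k, c, yx) to its offset in weightsBuf, following the loop order of the packing code; k is the output channel within group g, c the input channel within the group, and yx the flattened kernel position in [0, Hk*Wk):

    #include <cstddef>

    // Illustrative only: reproduces the offset arithmetic of the packing loop
    // above. Each block of FAST_CONV_MR output channels is stored contiguously
    // at the innermost position.
    static inline size_t packedWeightOfs(int g, int k, int c, int yx,
                                         int Kg_aligned, int Cg, int HkWk)
    {
        size_t block = (size_t)g * (Kg_aligned / FAST_CONV_MR) + k / FAST_CONV_MR;
        return ((block * Cg + c) * HkWk + yx) * FAST_CONV_MR + k % FAST_CONV_MR;
    }

This interleaving is what lets convBlock broadcast FAST_CONV_MR weights from consecutive memory in its inner loop.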

static void packInput(float* inpbuf, const float* inptr, int* yxtab, int ksize, int Cg, int Hi, int Wi, int W0,
                      int pad_top, int pad_left, int stride_x, int stride_y, int yx0, int slice_len,
                      bool fast_1x1, bool partial0, bool s1d1p0, bool s1d1)
{
    const size_t inp_planesize = (size_t)Hi*Wi;

    if (fast_1x1)
    {
        /*
           Super-fast branch for 1x1 convolutions with sy=sx=1.
           In this case each feature plane can be safely treated
           as a 1D array, and we just extract the next portion
           of FAST_CONV_NR elements from each feature plane and
           put them together.
        */
        inptr += yx0;
        if (!partial0)
        {
            // Make a special branch where memcpy() is called with a constant buffer size.
            // Compilers will likely unroll this loop properly.
            for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += FAST_CONV_NR)
                memcpy(inpbuf, inptr, FAST_CONV_NR * sizeof(inpbuf[0]));
        }
        else
        {
            for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += FAST_CONV_NR)
            {
                memcpy(inpbuf, inptr, slice_len * sizeof(inpbuf[0]));
                memset(inpbuf + slice_len, 0, (FAST_CONV_NR - slice_len) * sizeof(inpbuf[0]));
            }
        }
    }
    else if (s1d1p0)
    {
        /*
           Slower, but still fast branch for sy=sx=1, dy=dx=1 and without padding.
           In this case we copy the data from the input tensors by chunks.
        */
        for (int c = 0; c < Cg; c++)
        {
            float *inpbuf_c = inpbuf + c * (FAST_CONV_NR * ksize);
            const float *inptr_c = inptr + c * inp_planesize;

            for (int k = 0; k < ksize; k++)
            {
                int y0 = yx0 / W0, x0 = yx0 % W0;
                int yi = y0 + yxtab[k * 2], xi = x0 + yxtab[k * 2 + 1];
                float *inpbuf_k = inpbuf_c + k * FAST_CONV_NR;
                int xi_0 = yxtab[k * 2 + 1];

                int i = 0;
                for (; i < slice_len;)
                {
                    const float *inptr_k = inptr_c + yi * Wi + xi;
                    int copy_len = std::min(slice_len - i, W0 - x0);
                    int di_z = (slice_len == i + copy_len) ? FAST_CONV_NR - slice_len : 0;

                    memcpy(inpbuf_k + i, inptr_k, copy_len * sizeof(inpbuf_k[0]));
                    memset(inpbuf_k + i + copy_len, 0, di_z * sizeof(inpbuf_k[0]));

                    i += copy_len;
                    x0 = 0;
                    xi = xi_0;
                    yi++;
                }
            }
        }
    }
    else if (s1d1)
    {
        /*
           Slower, but still fast branch for sy=sx=1, dy=dx=1.
           In this case we copy the data from the input tensors by chunks and,
           when necessary, interleave the data in inpbuf with 0's
           (which correspond to the padding elements).
        */
        int y0 = yx0 / W0, x0 = yx0 % W0;
        for (int c = 0; c < Cg; c++)
        {
            float *inpbuf_c = inpbuf + c * (FAST_CONV_NR * ksize);
            const float *inptr_c = inptr + c * inp_planesize;

            for (int k = 0; k < ksize; k++)
            {
                int x0_tmp = x0;

                int xi_0 = yxtab[k * 2 + 1] - pad_left;

                int yi = y0 + yxtab[k * 2] - pad_top, xi = x0_tmp + xi_0;
                float *inpbuf_k = inpbuf_c + k * FAST_CONV_NR;

                int i = 0;
                for (; i < slice_len;)
                {
                    int copyLen = std::min(slice_len - i, W0 - x0_tmp);
                    int di_z = (i + copyLen == slice_len) ? FAST_CONV_NR - slice_len : 0;  // the final padding

                    // pad_top or pad_bottom
                    if (yi < 0 || yi > Hi - 1)
                    {
                        memset(inpbuf_k + i, 0, (copyLen + di_z) * sizeof(inpbuf_k[0]));
                        i += copyLen + di_z;
                    }
                    else
                    {
                        int x_pad_left = 0, x_pad_right = 0;

                        // pad_left
                        if (xi < 0)
                        {
                            x_pad_left = std::min(-xi, copyLen);
                            xi = 0;
                            copyLen -= x_pad_left;
                        }

                        memset(inpbuf_k + i, 0, x_pad_left * sizeof(inpbuf_k[0]));
                        i += x_pad_left;

                        // pad_right
                        if (xi + copyLen > Wi)
                        {
                            if (xi > Wi)
                            {
                                x_pad_right = copyLen;
                                copyLen = 0;
                            }
                            else
                            {
                                x_pad_right = std::min(xi + copyLen - Wi, copyLen);
                                copyLen -= x_pad_right;
                            }
                        }

                        CV_Assert(copyLen >= 0);

                        const float *inptr_k = inptr_c + yi * Wi + xi;
                        memcpy(inpbuf_k + i, inptr_k, copyLen * sizeof(inpbuf_k[0]));

                        i += copyLen;

                        // pad_right and the final padding
                        memset(inpbuf_k + i, 0, (di_z + x_pad_right) * sizeof(inpbuf_k[0]));
                        i += x_pad_right + di_z;
                    }

                    x0_tmp = 0;
                    xi = xi_0;
                    yi++;
                }
            }
        }
    }
    else
    {
        int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0;
        for (int k = 0; k < ksize; k++)
        {
            int dy = yxtab[k * 2], dx = yxtab[k * 2 + 1];
            int i = 0, y0 = y0_, x0 = x0_;
            for (; i < FAST_CONV_NR;)
            {
                float *inpbuf_ki = inpbuf + k * FAST_CONV_NR + i;
                int yi = y0 * stride_y + dy - pad_top;
                int xi = x0 * stride_x + dx - pad_left;

                if ((unsigned) yi < (unsigned) Hi &&
                    (unsigned) xi < (unsigned) Wi)
                {
                    const float *inptr_ki = inptr + yi * Wi + xi;
                    if (i + 4 <= FAST_CONV_NR && x0 + 4 <= W0 && xi + stride_x * 4 <= Wi)
                    {
                        if (stride_x == 2)
                        {
                            for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * ksize, inptr_ki += inp_planesize)
                            {
                                float t0 = inptr_ki[0], t1 = inptr_ki[2];
                                float t2 = inptr_ki[4], t3 = inptr_ki[6];
                                inpbuf_ki[0] = t0;
                                inpbuf_ki[1] = t1;
                                inpbuf_ki[2] = t2;
                                inpbuf_ki[3] = t3;
                            }
                        }
                        else
                        {
                            for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * ksize, inptr_ki += inp_planesize)
                            {
                                float t0 = inptr_ki[0], t1 = inptr_ki[stride_x];
                                float t2 = inptr_ki[stride_x * 2], t3 = inptr_ki[stride_x * 3];
                                inpbuf_ki[0] = t0;
                                inpbuf_ki[1] = t1;
                                inpbuf_ki[2] = t2;
                                inpbuf_ki[3] = t3;
                            }
                        }
                        i += 4;
                        x0 += 4;
                    }
                    else
                    {
                        for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * ksize, inptr_ki += inp_planesize)
                            *inpbuf_ki = *inptr_ki;
                        i++;
                        x0++;
                    }
                }
                else
                {
                    for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * ksize)
                        inpbuf_ki[0] = 0.f;
                    i++;
                    x0++;
                }
                // Branchless wrap to the next output row once x0 reaches W0.
                int mask = x0 >= W0;
                y0 += mask;
                x0 &= mask - 1;
            }
        }
    }
}

static void matMulCompute(float* outptr0, float* inpbuf_task, float* cbuf, const Ptr<FastConv2d>& conv, int HkWkCg,
                          int k0, int k1, int yx0, int yx1, size_t out_planesize, int g, int Kg, int Kg_aligned,
                          bool partial0, ActivationLayer*& activ, float minval, float maxval, bool ifMinMaxAct)
{
    int outstep0 = out_planesize;

    for (int k = k0; k < k1; k += FAST_CONV_MR, outptr0 += outstep0 * FAST_CONV_MR)
    {
        int dk = Kg - k < FAST_CONV_MR ? Kg - k : FAST_CONV_MR;
        bool partial = partial0 || dk < FAST_CONV_MR;
        float *outptr = outptr0;

        int outstep = outstep0;
        if (partial)
        {
            outptr = cbuf;
            outstep = FAST_CONV_NR;
        }

#if CV_TRY_AVX2
        if (conv->useAVX2)
            opt_AVX2::convBlock_AVX2(HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg,
                                     inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k,
                                     minval, maxval, ifMinMaxAct);
        else
#endif
#if CV_TRY_NEON
        if (conv->useNEON)
            opt_NEON::convBlock_NEON(HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg,
                                     inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k,
                                     minval, maxval, ifMinMaxAct);
        else
#endif
        convBlock(HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg,
                  inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k,
                  minval, maxval, ifMinMaxAct);

        // activation
        if (activ)
            activ->forwardSlice(outptr, outptr, yx1 - yx0, outstep, Kg * g + k,
                                Kg * g + k + dk);

        if (partial)
        {
            for (int i = 0; i < dk; i++)
                memcpy(outptr0 + i * outstep0, cbuf + i * FAST_CONV_NR,
                       (yx1 - yx0) * sizeof(cbuf[0]));
        }
    }
}

void runFastConv2d(InputArray _input, OutputArray _output,
                   const Ptr<FastConv2d>& conv, int ntasks, const Ptr<ActivationLayer>& actLayer)
{
    Mat input = _input.getMat();
    Mat output = _output.getMat();
    MatShape inputShape = shape(input);
    MatShape outputShape = shape(output);
    CV_Assert(inputShape.size() == 4 && outputShape.size() == 4);

    ActivationLayer* activ = 0;
    float minval = -FLT_MAX, maxval = FLT_MAX;
    bool ifMinMaxAct = false;
    if (actLayer)
    {
        Ptr<ReLULayer> activ_relu = actLayer.dynamicCast<ReLULayer>();
        Ptr<ReLU6Layer> activ_relu6 = actLayer.dynamicCast<ReLU6Layer>();

        if (!activ_relu.empty())
        {
            if (activ_relu->negativeSlope == 0.0f)
            {
                minval = 0.0f;
                ifMinMaxAct = true;
                activ = nullptr;
            }
            else  // Leaky ReLU
            {
                activ = actLayer.get();
            }
        }
        else if (!activ_relu6.empty())
        {
            minval = activ_relu6->minValue;
            maxval = activ_relu6->maxValue;

            ifMinMaxAct = true;
            activ = nullptr;
        }
        else
            activ = actLayer.get();
    }
    else
        activ = nullptr;

    if (conv->ngroups > 1 && conv->ngroups == conv->K && conv->ngroups == conv->C)
    {
        return runDepthwise(input, output, conv, minval, maxval, activ, ifMinMaxAct);
    }

#if CV_NEON
    if (conv->ifWinograd63
        && inputShape[2] > 12 && inputShape[3] > 12
        && inputShape[2] < 120 && inputShape[3] < 120)
    {
        // In general, the Winograd branch gets better performance from more cores.
        int maxNumThread = std::max(getNumThreads(), 1);
        if (runWinograd63(input, output, conv, maxNumThread, minval, maxval, activ, ifMinMaxAct))
            return;
    }
#endif

    float* inp = input.ptr<float>();
    float* out = output.ptr<float>();

    int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3];  // [N, C, H, W]
    int K = conv->K, Hk = conv->Hk, Wk = conv->Wk;
    int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups;
    int Cg = C/ngroups, Kg = K/ngroups;
    int Kg_nblocks = (Kg + FAST_CONV_MR-1)/FAST_CONV_MR, Kg_aligned = Kg_nblocks*FAST_CONV_MR;  // align to MR

    const size_t inp_planesize = (size_t)Hi*Wi;
    const size_t out_planesize = (size_t)H0*W0;

    int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
    int pad_left = conv->pad_left;
    int pad_right = conv->pad_right;

    int stride_y = conv->stride_y, stride_x = conv->stride_x;
    int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x;

    int ksize = Hk * Wk;
    bool s1d1 = stride_x == 1 && stride_y == 1 && dilation_x == 1 && dilation_y == 1;
    bool s1d1p0 = s1d1 && pad_top == 0 && pad_left == 0 && pad_bottom == 0 && pad_right == 0;
    bool fast_1x1 = stride_x == 1 && stride_y == 1 && ksize == 1;
    int HkWkCg = Hk*Wk*Cg;

    enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F };
    size_t taskbufsize = FAST_CONV_NR*HkWkCg;  // input buffer
    size_t taskbufsizeOutput = FAST_CONV_NR * FAST_CONV_MR;
    size_t inputbufsize = 0;
    size_t outbufsize = ntasks * taskbufsizeOutput;

    int stripes_per_sample = (out_planesize + FAST_CONV_NR - 1)/FAST_CONV_NR;  // align to NR
    size_t hw_task = stripes_per_sample;
    size_t hw_aligned = stripes_per_sample * FAST_CONV_NR;

    bool separatedLoop = false;

    if (stripes_per_sample < 4 * ntasks)
    {
        // If stripes_per_sample is small, we parallelize over K (the output channels).
        stripes_per_sample = 1;

        // A separated parallel loop can save a lot of time in packing the input data,
        // but it may cost more memory, so we use it only when the batch size is 1.
        if (N == 1)
        {
            separatedLoop = true;
            inputbufsize = ngroups * hw_aligned * HkWkCg;
        }

        if (!separatedLoop)
        {
            inputbufsize = taskbufsize * ntasks;
        }
    }
    else
    {
        // If stripes_per_sample is big, we parallelize over H0*W0.
        Kg_nblocks = 1;
        inputbufsize = taskbufsize * ntasks;
    }

    int Kstripes = Kg_nblocks*stripes_per_sample;
    int nsubtasks = N*ngroups*Kstripes;

    AutoBuffer<float> inpbuf_all_, outputbuf_;
    inputbufsize = alignSize(inputbufsize, VEC_ALIGN);
    inpbuf_all_.allocate(inputbufsize + VEC_ALIGN);
    float* inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN*sizeof(float)));

    outbufsize = alignSize(outbufsize, VEC_ALIGN);
    outputbuf_.allocate(outbufsize + VEC_ALIGN);
    float* output_buf = alignPtr(outputbuf_.data(), (int)(VEC_ALIGN*sizeof(float)));

    std::vector<int> ofstab_(Hk*Wk*3, 0);
    int* ofstab = ofstab_.data();
    int* yxtab = ofstab + Hk*Wk;

    for (int y = 0; y < Hk; y++)
        for (int x = 0; x < Wk; x++)
        {
            int k = y*Wk + x;
            int dy = y*dilation_y, dx = x*dilation_x;
            yxtab[k*2] = dy;
            yxtab[k*2+1] = dx;
            ofstab[k] = dy*Wi + dx;
        }

    if (ksize == 1)
    {
        CV_Assert(pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0);
        CV_Assert(stride_x != 1 || stride_y != 1 || (H0 == Hi && W0 == Wi));
    }

    if (separatedLoop)
    {
        // For now this branch only handles batch size = 1. Maybe we could support batch sizes < 10 in the future.
        // Pack the input data.
        parallel_for_(Range(0, ngroups * hw_task), [&](const Range& r0)
        {
            for (int nhwi = r0.start; nhwi < r0.end; nhwi++)
            {
                int g = nhwi/hw_task;
                int hw_i = nhwi % hw_task;
                int hw0 = hw_i * FAST_CONV_NR;
                float* inpbuf = inpbuf_all + g * hw_aligned * HkWkCg + hw0 * HkWkCg;
                const float* inptr = inp + g * Cg * inp_planesize;
                bool partial0 = hw0 + FAST_CONV_NR > out_planesize;
                int slice_len = FAST_CONV_NR;

                if (partial0)
                    slice_len = out_planesize - hw0;

                packInput(inpbuf, inptr, yxtab, ksize, Cg, Hi, Wi, W0, pad_top, pad_left, stride_x, stride_y,
                          hw0, slice_len, fast_1x1, partial0, s1d1p0, s1d1);
            }
        });

        // Compute
        parallel_for_(Range(0, ntasks), [&](const Range& r0)
        {
            for (int task_id = r0.start; task_id < r0.end; task_id++)
            {
                float *cbuf = output_buf + task_id * taskbufsizeOutput;
                int ngs0 = (int) ((size_t) nsubtasks * task_id / ntasks);
                int ngs1 = (int) ((size_t) nsubtasks * (task_id + 1) / ntasks);
                for (int subtask = ngs0; subtask < ngs1;)
                {
                    int ng = subtask / Kstripes;
                    int kyx0 = subtask - ng * Kstripes;
                    int kyx1 = kyx0 + (ngs1 - subtask);
                    int n = ng / ngroups, g = ng - n * ngroups;

                    CV_Assert(n <= 1);

                    kyx1 = kyx1 <= Kstripes ? kyx1 : Kstripes;  // Guarantee that kyx1 never exceeds Kstripes.
                    subtask += kyx1 - kyx0;

                    int k0 = kyx0 * FAST_CONV_MR;
                    int k1 = kyx1 * FAST_CONV_MR;
                    k1 = k1 <= Kg ? k1 : Kg;

                    for (int yx0 = 0; yx0 < out_planesize; yx0 += FAST_CONV_NR)
                    {
                        float* inpbuf_task = inpbuf_all + g * hw_aligned * HkWkCg + yx0 * HkWkCg;
                        int yx1 = yx0 + FAST_CONV_NR;
                        yx1 = yx1 <= out_planesize ? yx1 : out_planesize;
                        int slice_len = yx1 - yx0;
                        bool partial0 = slice_len < FAST_CONV_NR;

                        int outstep0 = out_planesize;
                        size_t outofs = ((n * ngroups + g) * Kg + k0) * outstep0 + yx0;
                        float *outptr0 = out + outofs;

                        matMulCompute(outptr0, inpbuf_task, cbuf, conv, HkWkCg, k0, k1, yx0, yx1, out_planesize, g,
                                      Kg, Kg_aligned, partial0, activ, minval, maxval, ifMinMaxAct);
                    }
                }
            }
        });
    }
    else
    {
        parallel_for_(Range(0, ntasks), [&](const Range &r0)
        {
            for (int task_id = r0.start; task_id < r0.end; task_id++)
            {
                float *inpbuf_task = &inpbuf_all[taskbufsize * task_id];
                float *cbuf = output_buf + task_id * taskbufsizeOutput;
                int ngs0 = (int) ((size_t) nsubtasks * task_id / ntasks);
                int ngs1 = (int) ((size_t) nsubtasks * (task_id + 1) / ntasks);

                for (int subtask = ngs0; subtask < ngs1;)
                {
                    int ng = subtask / Kstripes;
                    int kyx0 = subtask - ng * Kstripes;
                    int kyx1 = kyx0 + (ngs1 - subtask);
                    int n = ng / ngroups, g = ng - n * ngroups;
                    size_t inp_plane_ofs = (size_t) (n * ngroups + g) * Cg * inp_planesize;
                    kyx1 = kyx1 <= Kstripes ? kyx1 : Kstripes;  // Guarantee that kyx1 never exceeds Kstripes.
                    subtask += kyx1 - kyx0;
                    int k0, k1;
                    int yx0, yx_limit;

                    if (stripes_per_sample == 1)
                    {
                        k0 = kyx0 * FAST_CONV_MR;
                        k1 = kyx1 * FAST_CONV_MR;
                        k1 = k1 <= Kg ? k1 : Kg;
                        yx0 = 0;
                        yx_limit = out_planesize;
                    }
                    else
                    {
                        k0 = 0;
                        k1 = Kg;
                        yx0 = kyx0 * FAST_CONV_NR;
                        yx_limit = kyx1 * FAST_CONV_NR;
                        yx_limit = yx_limit < out_planesize ? yx_limit : out_planesize;
                    }

                    for (; yx0 < yx_limit; yx0 += FAST_CONV_NR)
                    {
                        float *inpbuf = inpbuf_task;
                        const float *inptr = inp + inp_plane_ofs;
                        int yx1 = yx0 + FAST_CONV_NR;
                        yx1 = yx1 <= yx_limit ? yx1 : yx_limit;
                        int slice_len = yx1 - yx0;
                        bool partial0 = slice_len < FAST_CONV_NR;
                        packInput(inpbuf, inptr, yxtab, ksize, Cg, Hi, Wi, W0, pad_top, pad_left, stride_x, stride_y,
                                  yx0, slice_len, fast_1x1, partial0, s1d1p0, s1d1);

                        // 2. Do the convolution: compute the Kg x (yx1 - yx0) part of the output tensor.
                        int outstep0 = out_planesize;
                        size_t outofs = ((n * ngroups + g) * Kg + k0) * outstep0 + yx0;
                        float *outptr0 = out + outofs;

                        matMulCompute(outptr0, inpbuf_task, cbuf, conv, HkWkCg, k0, k1, yx0, yx1, out_planesize, g,
                                      Kg, Kg_aligned, partial0, activ, minval, maxval, ifMinMaxAct);
                    }
                }
            }
        });
    }
}

}}  // namespace cv::dnn
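Both compute loops above carve the nsubtasks work units across ntasks workers with the same integer formula. A standalone sketch (not from the patch) showing that the resulting ranges tile [0, nsubtasks) exactly and differ in size by at most one:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        int nsubtasks = 10, ntasks = 4;
        for (int task_id = 0; task_id < ntasks; task_id++)
        {
            int ngs0 = (int)((size_t)nsubtasks * task_id / ntasks);
            int ngs1 = (int)((size_t)nsubtasks * (task_id + 1) / ntasks);
            printf("task %d: [%d, %d)\n", task_id, ngs0, ngs1);
            // Prints: 0:[0,2)  1:[2,5)  2:[5,7)  3:[7,10)
        }
        return 0;
    }

Because consecutive ranges share their boundary, no unit is dropped or duplicated regardless of how nsubtasks divides by ntasks.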
@@ -0,0 +1,89 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_FAST_CONVOLUTION_HPP
#define OPENCV_FAST_CONVOLUTION_HPP

#include "opencv2/core/hal/intrin.hpp"

#ifndef FAST_CONV_PRAM
#define FAST_CONV_PRAM
#if CV_NEON && __aarch64__  // 32 registers
#define FAST_CONV_MR 4
#define FAST_CONV_NR 28
enum { FAST_VEC_NLANES = 4 };
#elif CV_NEON  // 16 registers
#define FAST_CONV_MR 4
#define FAST_CONV_NR 12
enum { FAST_VEC_NLANES = 4 };
#else  // SIMD 128, AVX or AVX2
#define FAST_CONV_MR 4
#define FAST_CONV_NR 24
enum { FAST_VEC_NLANES = 4 };
#endif
#endif

namespace cv {
namespace dnn {

struct FastConv2d
{
    int ngroups;
    int K, C, Hk, Wk;
    int stride_y, stride_x;
    int dilation_y, dilation_x;
    int pad_top, pad_bottom, pad_left, pad_right;

    std::vector<float> weightsBuf;        // For generic 2D convolution.
    std::vector<float> weightsWino63Buf;  // For Winograd F(6x6, 3x3).

    std::vector<float> biasBuf;
    bool ifWinograd63 = false;
    bool useAVX2 = checkHardwareSupport(CPU_AVX2);
    bool useNEON = checkHardwareSupport(CPU_NEON);
};

// Returns a FastConv2d instance.
Ptr<FastConv2d> initFastConv2d(
        int ngroups,
        int K, int C, int Hk, int Wk,
        int stride_x, int stride_y,
        int dilation_x, int dilation_y,
        const std::vector<size_t>& pads_begin,
        const std::vector<size_t>& pads_end,
        float* srcWeights,
        float* srcBias);

// Dispatches between the different computing branches, such as Winograd and 1x1 convolution.
void runFastConv2d(InputArray _input, OutputArray _output,
                   const Ptr<FastConv2d>& conv, int ntasks, const Ptr<ActivationLayer>& actLayer);

void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
                  ActivationLayer* activ, bool ifMinMaxAct);

// Winograd initialization
void initWinograd63(Ptr<FastConv2d>& conv, float* src_weight, int K, int C);

int runWinograd63(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
                  float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);

}  // namespace dnn

namespace opt_AVX2
{
#if CV_TRY_AVX2
void convBlock_AVX2(int k, const float *a, const float *b,
                    float *c, int ldc, const float *bias,
                    float minval, float maxval, bool ifActiv);

void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab,
                         float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left,
                         int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
                         int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3);
#endif
}  // namespace opt_AVX2

}  // namespace cv

#endif  // OPENCV_FAST_CONVOLUTION_HPP
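For orientation, here is a minimal sketch of the intended call sequence for this API. The real call site (the DNN convolution layer) is not part of this diff, and the function name, shapes, and sizes below are illustrative:

    // Illustrative only: `input` and `output` are 4D [N, C, H, W] CV_32F blobs
    // with matching shapes for this configuration; `weights` holds K*C*Hk*Wk
    // floats in KCHW order and `bias` holds K floats.
    void exampleForward(const cv::Mat& input, cv::Mat& output,
                        float* weights, float* bias, int ntasks)
    {
        std::vector<size_t> pads_begin = {1, 1};  // {pad_top, pad_left}
        std::vector<size_t> pads_end   = {1, 1};  // {pad_bottom, pad_right}
        // A 3x3, stride-1, dilation-1, single-group 64 -> 64 convolution.
        cv::Ptr<cv::dnn::FastConv2d> conv = cv::dnn::initFastConv2d(
                /*ngroups*/ 1, /*K*/ 64, /*C*/ 64, /*Hk*/ 3, /*Wk*/ 3,
                /*stride_x*/ 1, /*stride_y*/ 1, /*dilation_x*/ 1, /*dilation_y*/ 1,
                pads_begin, pads_end, weights, bias);
        // `conv` owns the packed weights and can be reused across forward passes.
        cv::dnn::runFastConv2d(input, output, conv, ntasks,
                               /*actLayer*/ cv::Ptr<cv::dnn::ActivationLayer>());
    }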
@ -0,0 +1,342 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP |
||||
#define OPENCV_FAST_CONVOLUTION_SIMD_HPP |
||||
|
||||
#include "opencv2/core/hal/intrin.hpp" |
||||
#include <opencv2/core/utils/logger.hpp> |
||||
|
||||
namespace cv { |
||||
namespace dnn { |
||||
|
||||
void convBlock(int k, const float *a, const float *b, |
||||
float *c, int ldc, const float *bias, |
||||
float minval, float maxval, bool ifActiv) |
||||
{ |
||||
#if CV_SIMD128 |
||||
#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24 |
||||
{ |
||||
v_float32x4 c0 = v_setall_f32(bias[0]), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; |
||||
v_float32x4 c6 = v_setall_f32(bias[1]), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; |
||||
v_float32x4 c12 = v_setall_f32(bias[2]), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12; |
||||
v_float32x4 c18 = v_setall_f32(bias[3]), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18; |
||||
|
||||
for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR) |
||||
{ |
||||
v_float32x4 a0 = v_setall_f32(a[0]); |
||||
v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); |
||||
v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); |
||||
|
||||
c0 = v_fma(b0, a0, c0); |
||||
c1 = v_fma(b1, a0, c1); |
||||
c2 = v_fma(b2, a0, c2); |
||||
c3 = v_fma(b3, a0, c3); |
||||
c4 = v_fma(b4, a0, c4); |
||||
c5 = v_fma(b5, a0, c5); |
||||
|
||||
a0 = v_setall_f32(a[1]); |
||||
c6 = v_fma(b0, a0, c6); |
||||
c7 = v_fma(b1, a0, c7); |
||||
c8 = v_fma(b2, a0, c8); |
||||
c9 = v_fma(b3, a0, c9); |
||||
c10 = v_fma(b4, a0, c10); |
||||
c11 = v_fma(b5, a0, c11); |
||||
|
||||
a0 = v_setall_f32(a[2]); |
||||
c12 = v_fma(b0, a0, c12); |
||||
c13 = v_fma(b1, a0, c13); |
||||
c14 = v_fma(b2, a0, c14); |
||||
c15 = v_fma(b3, a0, c15); |
||||
c16 = v_fma(b4, a0, c16); |
||||
c17 = v_fma(b5, a0, c17); |
||||
|
||||
a0 = v_setall_f32(a[3]); |
||||
c18 = v_fma(b0, a0, c18); |
||||
c19 = v_fma(b1, a0, c19); |
||||
c20 = v_fma(b2, a0, c20); |
||||
c21 = v_fma(b3, a0, c21); |
||||
c22 = v_fma(b4, a0, c22); |
||||
c23 = v_fma(b5, a0, c23); |
||||
} |
||||
|
||||
if (ifActiv) { |
||||
v_float32x4 vmin = v_setall_f32(minval), vmax = v_setall_f32(maxval); |
||||
c0 = v_min(v_max(c0, vmin), vmax); |
||||
c1 = v_min(v_max(c1, vmin), vmax); |
||||
c2 = v_min(v_max(c2, vmin), vmax); |
||||
c3 = v_min(v_max(c3, vmin), vmax); |
||||
c4 = v_min(v_max(c4, vmin), vmax); |
||||
c5 = v_min(v_max(c5, vmin), vmax); |
||||
c6 = v_min(v_max(c6, vmin), vmax); |
||||
c7 = v_min(v_max(c7, vmin), vmax); |
||||
c8 = v_min(v_max(c8, vmin), vmax); |
||||
c9 = v_min(v_max(c9, vmin), vmax); |
||||
c10 = v_min(v_max(c10, vmin), vmax); |
||||
c11 = v_min(v_max(c11, vmin), vmax); |
||||
c12 = v_min(v_max(c12, vmin), vmax); |
||||
c13 = v_min(v_max(c13, vmin), vmax); |
||||
c14 = v_min(v_max(c14, vmin), vmax); |
||||
c15 = v_min(v_max(c15, vmin), vmax); |
||||
c16 = v_min(v_max(c16, vmin), vmax); |
||||
c17 = v_min(v_max(c17, vmin), vmax); |
||||
c18 = v_min(v_max(c18, vmin), vmax); |
||||
c19 = v_min(v_max(c19, vmin), vmax); |
||||
c20 = v_min(v_max(c20, vmin), vmax); |
||||
c21 = v_min(v_max(c21, vmin), vmax); |
||||
c22 = v_min(v_max(c22, vmin), vmax); |
||||
c23 = v_min(v_max(c23, vmin), vmax); |
||||
} |
        v_store(c, c0);
        v_store(c + 4, c1);
        v_store(c + 8, c2);
        v_store(c + 12, c3);
        v_store(c + 16, c4);
        v_store(c + 20, c5);

        v_store(c + ldc, c6);
        v_store(c + ldc + 4, c7);
        v_store(c + ldc + 8, c8);
        v_store(c + ldc + 12, c9);
        v_store(c + ldc + 16, c10);
        v_store(c + ldc + 20, c11);

        v_store(c + ldc * 2, c12);
        v_store(c + ldc * 2 + 4, c13);
        v_store(c + ldc * 2 + 8, c14);
        v_store(c + ldc * 2 + 12, c15);
        v_store(c + ldc * 2 + 16, c16);
        v_store(c + ldc * 2 + 20, c17);

        v_store(c + ldc * 3, c18);
        v_store(c + ldc * 3 + 4, c19);
        v_store(c + ldc * 3 + 8, c20);
        v_store(c + ldc * 3 + 12, c21);
        v_store(c + ldc * 3 + 16, c22);
        v_store(c + ldc * 3 + 20, c23);
    }
#endif
#else
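    // Scalar fallback: the same bias/FMA/clamp arithmetic without universal
    // intrinsics, used when CV_SIMD128 is not available.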
    for (int i = 0; i < FAST_CONV_MR; i++)
    {
        float beta = bias[i];
        for (int j = 0; j < FAST_CONV_NR; j++)
            c[i*ldc + j] = beta;
    }
    for (int p = 0; p < k; p++)
    {
        for (int i = 0; i < FAST_CONV_MR; i++)
        {
            float alpha = a[FAST_CONV_MR*p + i];
            for (int j = 0; j < FAST_CONV_NR; j++)
            {
                c[i*ldc + j] += b[FAST_CONV_NR*p + j]*alpha;
            }
        }
    }
    if (ifActiv)
    {
        for (int i = 0; i < FAST_CONV_MR; i++)
        {
            for (int j = 0; j < FAST_CONV_NR; j++)
            {
                float v = c[i*ldc + j];
                v = std::min(std::max(v, minval), maxval);
                c[i*ldc + j] = v;
            }
        }
    }
#endif
}
} // namespace dnn

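// A minimal usage sketch (illustrative, not part of this patch): how a
// packed-GEMM driver might invoke the micro-kernel on one output tile.
// The names `K`, `packedA`, `packedB`, `outRow`, `outWidth` and `biasRow`
// are assumptions for the example; the real call sites live elsewhere in
// this patch.
//
//   #include <cfloat>  // FLT_MAX
//
//   void runTile(int K, const float* packedA, const float* packedB,
//                float* outRow, int outWidth, const float* biasRow)
//   {
//       // One call fills a FAST_CONV_MR x FAST_CONV_NR patch of the output,
//       // fusing bias addition and a ReLU-style clamp into the GEMM tile.
//       cv::dnn::convBlock(K, packedA, packedB, outRow, /*ldc=*/outWidth,
//                          biasRow, /*minval=*/0.f, /*maxval=*/FLT_MAX,
//                          /*ifActiv=*/true);
//   }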

namespace opt_NEON
{
#if CV_TRY_NEON
void convBlock_NEON(int k, const float *a, const float *b,
                    float *c, int ldc, const float *bias,
                    float minval, float maxval, bool ifActiv)
{
#if FAST_CONV_MR == 4 && FAST_CONV_NR == 12
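    // 4 x 12 tile: three q-registers per output row. vfmaq_laneq_f32(c, b, a, i)
    // computes c + b * a[i], so a single vld1q_f32 of `a` feeds all four rows
    // without separate broadcast instructions.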
    {
        float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0;
        float32x4_t c3 = vdupq_n_f32(bias[1]), c4 = c3, c5 = c3;
        float32x4_t c6 = vdupq_n_f32(bias[2]), c7 = c6, c8 = c6;
        float32x4_t c9 = vdupq_n_f32(bias[3]), c10 = c9, c11 = c9;

        float32x4_t a0 = vdupq_n_f32(0.0f);
        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);

        for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR)
        {
            a0 = vld1q_f32(a);
            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);

            c0 = vfmaq_laneq_f32(c0, b0, a0, 0);
            c1 = vfmaq_laneq_f32(c1, b1, a0, 0);
            c2 = vfmaq_laneq_f32(c2, b2, a0, 0);
            c3 = vfmaq_laneq_f32(c3, b0, a0, 1);
            c4 = vfmaq_laneq_f32(c4, b1, a0, 1);
            c5 = vfmaq_laneq_f32(c5, b2, a0, 1);

            c6 = vfmaq_laneq_f32(c6, b0, a0, 2);
            c7 = vfmaq_laneq_f32(c7, b1, a0, 2);
            c8 = vfmaq_laneq_f32(c8, b2, a0, 2);

            c9 = vfmaq_laneq_f32(c9, b0, a0, 3);
            c10 = vfmaq_laneq_f32(c10, b1, a0, 3);
            c11 = vfmaq_laneq_f32(c11, b2, a0, 3);
        }

        if (ifActiv)
        {
            b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval);
            c0 = vminq_f32(vmaxq_f32(c0, b0), b1);
            c1 = vminq_f32(vmaxq_f32(c1, b0), b1);
            c2 = vminq_f32(vmaxq_f32(c2, b0), b1);
            c3 = vminq_f32(vmaxq_f32(c3, b0), b1);
            c4 = vminq_f32(vmaxq_f32(c4, b0), b1);
            c5 = vminq_f32(vmaxq_f32(c5, b0), b1);
            c6 = vminq_f32(vmaxq_f32(c6, b0), b1);
            c7 = vminq_f32(vmaxq_f32(c7, b0), b1);
            c8 = vminq_f32(vmaxq_f32(c8, b0), b1);
            c9 = vminq_f32(vmaxq_f32(c9, b0), b1);
            c10 = vminq_f32(vmaxq_f32(c10, b0), b1);
            c11 = vminq_f32(vmaxq_f32(c11, b0), b1);
        }
        vst1q_f32(c, c0); vst1q_f32(c + 4, c1); vst1q_f32(c + 8, c2);
        vst1q_f32(c + ldc, c3); vst1q_f32(c + ldc + 4, c4); vst1q_f32(c + ldc + 8, c5);
        vst1q_f32(c + ldc*2, c6); vst1q_f32(c + ldc*2 + 4, c7); vst1q_f32(c + ldc*2 + 8, c8);
        vst1q_f32(c + ldc*3, c9); vst1q_f32(c + ldc*3 + 4, c10); vst1q_f32(c + ldc*3 + 8, c11);
    }
#elif FAST_CONV_MR == 4 && FAST_CONV_NR == 28
    {
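        // 4 x 28 tile: c0..c23 hold the first 24 columns (as in the 4 x 24
        // kernel) and c24..c27 the last four. Each k step consumes B as
        // 12 + 12 + 4 floats, advancing `b` manually instead of in the
        // for-statement.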
        float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0, c24 = c0;
        float32x4_t c6 = vdupq_n_f32(bias[1]), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6, c25 = c6;
        float32x4_t c12 = vdupq_n_f32(bias[2]), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12, c26 = c12;
        float32x4_t c18 = vdupq_n_f32(bias[3]), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18, c27 = c18;

        float32x4_t a0 = vdupq_n_f32(0.0f);
        float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f);

        for (int p = 0; p < k; p++, a += FAST_CONV_MR)
        {
            a0 = vld1q_f32(a);
            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
            b += 12;

            c0 = vfmaq_laneq_f32(c0, b0, a0, 0);
            c1 = vfmaq_laneq_f32(c1, b1, a0, 0);
            c2 = vfmaq_laneq_f32(c2, b2, a0, 0);
            c6 = vfmaq_laneq_f32(c6, b0, a0, 1);
            c7 = vfmaq_laneq_f32(c7, b1, a0, 1);
            c8 = vfmaq_laneq_f32(c8, b2, a0, 1);
            c12 = vfmaq_laneq_f32(c12, b0, a0, 2);
            c13 = vfmaq_laneq_f32(c13, b1, a0, 2);
            c14 = vfmaq_laneq_f32(c14, b2, a0, 2);
            c18 = vfmaq_laneq_f32(c18, b0, a0, 3);
            c19 = vfmaq_laneq_f32(c19, b1, a0, 3);
            c20 = vfmaq_laneq_f32(c20, b2, a0, 3);

            b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8);
            b += 12;

            c3 = vfmaq_laneq_f32(c3, b0, a0, 0);
            c4 = vfmaq_laneq_f32(c4, b1, a0, 0);
            c5 = vfmaq_laneq_f32(c5, b2, a0, 0);

            c9 = vfmaq_laneq_f32(c9, b0, a0, 1);
            c10 = vfmaq_laneq_f32(c10, b1, a0, 1);
            c11 = vfmaq_laneq_f32(c11, b2, a0, 1);

            c15 = vfmaq_laneq_f32(c15, b0, a0, 2);
            c16 = vfmaq_laneq_f32(c16, b1, a0, 2);
            c17 = vfmaq_laneq_f32(c17, b2, a0, 2);

            c21 = vfmaq_laneq_f32(c21, b0, a0, 3);

            b0 = vld1q_f32(b);
            b += 4;

            c22 = vfmaq_laneq_f32(c22, b1, a0, 3);
            c23 = vfmaq_laneq_f32(c23, b2, a0, 3);

            c24 = vfmaq_laneq_f32(c24, b0, a0, 0);
            c25 = vfmaq_laneq_f32(c25, b0, a0, 1);
            c26 = vfmaq_laneq_f32(c26, b0, a0, 2);
            c27 = vfmaq_laneq_f32(c27, b0, a0, 3);
        }

        if (ifActiv)
        {
            b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval);
            c0 = vminq_f32(vmaxq_f32(c0, b0), b1);
            c1 = vminq_f32(vmaxq_f32(c1, b0), b1);
            c2 = vminq_f32(vmaxq_f32(c2, b0), b1);
            c3 = vminq_f32(vmaxq_f32(c3, b0), b1);
            c4 = vminq_f32(vmaxq_f32(c4, b0), b1);
            c5 = vminq_f32(vmaxq_f32(c5, b0), b1);
            c6 = vminq_f32(vmaxq_f32(c6, b0), b1);
            c7 = vminq_f32(vmaxq_f32(c7, b0), b1);
            c8 = vminq_f32(vmaxq_f32(c8, b0), b1);
            c9 = vminq_f32(vmaxq_f32(c9, b0), b1);
            c10 = vminq_f32(vmaxq_f32(c10, b0), b1);
            c11 = vminq_f32(vmaxq_f32(c11, b0), b1);
            c12 = vminq_f32(vmaxq_f32(c12, b0), b1);
            c13 = vminq_f32(vmaxq_f32(c13, b0), b1);
            c14 = vminq_f32(vmaxq_f32(c14, b0), b1);
            c15 = vminq_f32(vmaxq_f32(c15, b0), b1);
            c16 = vminq_f32(vmaxq_f32(c16, b0), b1);
            c17 = vminq_f32(vmaxq_f32(c17, b0), b1);
            c18 = vminq_f32(vmaxq_f32(c18, b0), b1);
            c19 = vminq_f32(vmaxq_f32(c19, b0), b1);
            c20 = vminq_f32(vmaxq_f32(c20, b0), b1);
            c21 = vminq_f32(vmaxq_f32(c21, b0), b1);
            c22 = vminq_f32(vmaxq_f32(c22, b0), b1);
            c23 = vminq_f32(vmaxq_f32(c23, b0), b1);
            c24 = vminq_f32(vmaxq_f32(c24, b0), b1);
            c25 = vminq_f32(vmaxq_f32(c25, b0), b1);
            c26 = vminq_f32(vmaxq_f32(c26, b0), b1);
            c27 = vminq_f32(vmaxq_f32(c27, b0), b1);
        }
        vst1q_f32(c, c0);
        vst1q_f32(c + 4, c1);
        vst1q_f32(c + 8, c2);
        vst1q_f32(c + 12, c3);
        vst1q_f32(c + 16, c4);
        vst1q_f32(c + 20, c5);
        vst1q_f32(c + 24, c24);

        vst1q_f32(c + ldc, c6);
        vst1q_f32(c + ldc + 4, c7);
        vst1q_f32(c + ldc + 8, c8);
        vst1q_f32(c + ldc + 12, c9);
        vst1q_f32(c + ldc + 16, c10);
        vst1q_f32(c + ldc + 20, c11);
        vst1q_f32(c + ldc + 24, c25);

        vst1q_f32(c + ldc * 2, c12);
        vst1q_f32(c + ldc * 2 + 4, c13);
        vst1q_f32(c + ldc * 2 + 8, c14);
        vst1q_f32(c + ldc * 2 + 12, c15);
        vst1q_f32(c + ldc * 2 + 16, c16);
        vst1q_f32(c + ldc * 2 + 20, c17);
        vst1q_f32(c + ldc * 2 + 24, c26);

        vst1q_f32(c + ldc * 3, c18);
        vst1q_f32(c + ldc * 3 + 4, c19);
        vst1q_f32(c + ldc * 3 + 8, c20);
        vst1q_f32(c + ldc * 3 + 12, c21);
        vst1q_f32(c + ldc * 3 + 16, c22);
        vst1q_f32(c + ldc * 3 + 20, c23);
        vst1q_f32(c + ldc * 3 + 24, c27);
    }
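// Any other (FAST_CONV_MR, FAST_CONV_NR) configuration is rejected at
// compile time rather than silently falling through: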
#else
#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_NEON."
#endif
}

#endif
} // namespace opt_NEON

} // namespace cv

#endif // OPENCV_FAST_CONVOLUTION_SIMD_HPP