From ed9564106cac5c28472135b7a71b676122d52dd7 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 6 Jul 2017 21:36:59 +0300 Subject: [PATCH] reuse AVX2-optimized kernels for AVX1 CPUs (like IvyBridge) --- modules/dnn/src/layers/convolution_layer.cpp | 17 +- .../dnn/src/layers/fully_connected_layer.cpp | 9 +- modules/dnn/src/layers/layers_common.avx.cpp | 54 +++ modules/dnn/src/layers/layers_common.avx2.cpp | 308 +-------------- modules/dnn/src/layers/layers_common.hpp | 13 + modules/dnn/src/layers/layers_common.simd.hpp | 352 ++++++++++++++++++ 6 files changed, 447 insertions(+), 306 deletions(-) create mode 100644 modules/dnn/src/layers/layers_common.avx.cpp create mode 100644 modules/dnn/src/layers/layers_common.simd.hpp diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index f5c782f4ee..12e38c576b 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -285,11 +285,12 @@ public: const std::vector* reluslope_; const ActivationLayer* activ_; bool is1x1_; + bool useAVX; bool useAVX2; ParallelConv() : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), - biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX2(false) + biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false) {} static void run( const Mat& input, Mat& output, const Mat& weights, @@ -322,6 +323,7 @@ public: int inpCnAll = input.size[1], width = input.size[3], height = input.size[2]; int inpCn = inpCnAll / ngroups; p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0); + p.useAVX = checkHardwareSupport(CPU_AVX); p.useAVX2 = checkHardwareSupport(CPU_AVX2); int ncn = std::min(inpCn, (int)BLK_SIZE_CN); @@ -507,6 +509,12 @@ public: fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); else + #endif + #if CV_TRY_AVX + if(useAVX) + fastConv_avx(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, relu, cn0 == 0); + else #endif for( int i = 0; i < outCn; i += 2 ) { @@ -795,6 +803,7 @@ public: b_ = &b; c_ = &c; nstripes_ = nstripes; + useAVX = checkHardwareSupport(CPU_AVX); useAVX2 = checkHardwareSupport(CPU_AVX2); } @@ -817,6 +826,11 @@ public: if( useAVX2 ) fastGEMM_avx2( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); else + #endif + #if CV_TRY_AVX + if( useAVX ) + fastGEMM_avx( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); + else #endif for( m = 0; m < mmax; m += 2 ) { @@ -910,6 +924,7 @@ public: const Mat *a_, *b_; Mat* c_; int nstripes_; + bool useAVX; bool useAVX2; }; diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 071593d7f5..f27f39c660 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -119,7 +119,7 @@ public: class FullyConnected : public ParallelLoopBody { public: - FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX2(false) {} + FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false) {} static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, Mat& dstMat, const ActivationLayer* activ, int nstripes) @@ -139,6 +139,7 @@ public: p.dstMat = &dstMat; p.nstripes = nstripes; p.activ = activ; + p.useAVX = checkHardwareSupport(CPU_AVX); p.useAVX2 = checkHardwareSupport(CPU_AVX2); parallel_for_(Range(0, nstripes), p, 
nstripes); @@ -178,6 +179,11 @@ public: if( useAVX2 ) fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else + #endif + #if CV_TRY_AVX + if( useAVX ) + fastGEMM1T_avx( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); + else #endif { int i = 0; @@ -228,6 +234,7 @@ public: const ActivationLayer* activ; Mat* dstMat; int nstripes; + bool useAVX; bool useAVX2; }; diff --git a/modules/dnn/src/layers/layers_common.avx.cpp b/modules/dnn/src/layers/layers_common.avx.cpp new file mode 100644 index 0000000000..4e0c034eae --- /dev/null +++ b/modules/dnn/src/layers/layers_common.avx.cpp @@ -0,0 +1,54 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" +#include "layers_common.hpp" +#include "opencv2/core/hal/intrin.hpp" + +#define fastConv_some_avx fastConv_avx +#define fastGEMM1T_some_avx fastGEMM1T_avx +#define fastGEMM_some_avx fastGEMM_avx + +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) + +#include "layers_common.simd.hpp" diff --git a/modules/dnn/src/layers/layers_common.avx2.cpp b/modules/dnn/src/layers/layers_common.avx2.cpp index 4f0c15fa8c..ef8108cc25 100644 --- a/modules/dnn/src/layers/layers_common.avx2.cpp +++ b/modules/dnn/src/layers/layers_common.avx2.cpp @@ -44,308 +44,8 @@ #include "layers_common.hpp" #include "opencv2/core/hal/intrin.hpp" -namespace cv { -namespace dnn { +#define fastConv_some_avx fastConv_avx2 +#define fastGEMM1T_some_avx fastGEMM1T_avx2 +#define fastGEMM_some_avx fastGEMM_avx2 -void fastConv_avx2( const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput ) -{ - int outCn = outShape[1]; - size_t outPlaneSize = outShape[2]*outShape[3]; - float r0 = 1.f, r1 = 1.f, r2 = 1.f; - __m256 vr0 = _mm256_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm256_setzero_ps(); - - // now compute dot product of the weights - // and im2row-transformed part of the tensor - for( int i = 0; i < outCn; i += 3 ) - { - const float* wptr0 = weights + i*wstep; - const float* wptr1 = wptr0 + wstep; - const float* wptr2 = wptr1 + wstep; - float* outptr0 = output + i*outPlaneSize; - float* outptr1 = outptr0 + outPlaneSize; - float* outptr2 = outptr1 + outPlaneSize; - float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; - - if( i+2 >= outCn ) - { - wptr2 = wptr1; - outptr2 = outptr1; - bias2 = bias1; - if( i+1 >= outCn ) - { - wptr2 = wptr1 = wptr0; - outptr2 = outptr1 = outptr0; - bias2 = bias1 = bias0; - } - } - - if( relu ) - { - r0 = relu[i]; - r1 = relu[i+1]; - r2 = relu[i+2]; - vr0 = _mm256_set1_ps(r0); - vr1 = _mm256_set1_ps(r1); - vr2 = _mm256_set1_ps(r2); - } - - int j = 0; - for( ; j <= blockSize - 4; j += 4 ) - { - const float* rptr = rowbuf + j*vecsize_aligned; - - __m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(), - vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(), - vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(), - vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(), - vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(), - vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps(); - - for( int k = 0; k < vecsize; k += 8, rptr += 8 ) - { - __m256 w0 = _mm256_load_ps(wptr0 + k); - __m256 w1 = _mm256_load_ps(wptr1 + k); - __m256 w2 = _mm256_load_ps(wptr2 + k); - __m256 r0 = _mm256_load_ps(rptr); - - vs00 = _mm256_fmadd_ps(w0, r0, vs00); - vs10 = _mm256_fmadd_ps(w1, r0, vs10); - vs20 = _mm256_fmadd_ps(w2, r0, vs20); - - r0 = _mm256_load_ps(rptr + vecsize_aligned); - vs01 = _mm256_fmadd_ps(w0, r0, vs01); - vs11 = _mm256_fmadd_ps(w1, r0, vs11); - vs21 = _mm256_fmadd_ps(w2, r0, vs21); - - r0 = _mm256_load_ps(rptr + vecsize_aligned*2); - vs02 = _mm256_fmadd_ps(w0, r0, vs02); - vs12 = _mm256_fmadd_ps(w1, r0, vs12); - vs22 = _mm256_fmadd_ps(w2, r0, vs22); - - r0 = _mm256_load_ps(rptr + vecsize_aligned*3); - vs03 = _mm256_fmadd_ps(w0, r0, vs03); - vs13 = _mm256_fmadd_ps(w1, r0, vs13); - vs23 = _mm256_fmadd_ps(w2, r0, vs23); - } - - __m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), _mm256_hadd_ps(vs02, vs03)); - __m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), 
_mm256_hadd_ps(vs12, vs13)); - __m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23)); - - t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1)); - t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1)); - t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1)); - - __m256 s0, s1, s2; - - if( initOutput ) - { - s0 = _mm256_set1_ps(bias0); - s1 = _mm256_set1_ps(bias1); - s2 = _mm256_set1_ps(bias2); - } - else - { - s0 = _mm256_castps128_ps256(_mm_loadu_ps(outptr0 + j)); - s1 = _mm256_castps128_ps256(_mm_loadu_ps(outptr1 + j)); - s2 = _mm256_castps128_ps256(_mm_loadu_ps(outptr2 + j)); - } - - s0 = _mm256_add_ps(s0, t0); - s1 = _mm256_add_ps(s1, t1); - s2 = _mm256_add_ps(s2, t2); - - if( relu ) - { - __m256 m0 = _mm256_cmp_ps(s0, z, _CMP_GT_OS); - __m256 m1 = _mm256_cmp_ps(s1, z, _CMP_GT_OS); - __m256 m2 = _mm256_cmp_ps(s2, z, _CMP_GT_OS); - s0 = _mm256_xor_ps(s0, _mm256_andnot_ps(m0, _mm256_xor_ps(_mm256_mul_ps(s0, vr0), s0))); - s1 = _mm256_xor_ps(s1, _mm256_andnot_ps(m1, _mm256_xor_ps(_mm256_mul_ps(s1, vr1), s1))); - s2 = _mm256_xor_ps(s2, _mm256_andnot_ps(m2, _mm256_xor_ps(_mm256_mul_ps(s2, vr2), s2))); - } - - _mm_storeu_ps(outptr0 + j, _mm256_castps256_ps128(s0)); - _mm_storeu_ps(outptr1 + j, _mm256_castps256_ps128(s1)); - _mm_storeu_ps(outptr2 + j, _mm256_castps256_ps128(s2)); - } - - for( ; j < blockSize; j++ ) - { - const float* rptr = rowbuf + j*vecsize_aligned; - float s00, s10, s20; - - if( initOutput ) - { - s00 = bias0; - s10 = bias1; - s20 = bias2; - } - else - { - s00 = outptr0[j]; - s10 = outptr1[j]; - s20 = outptr2[j]; - } - - for( int k = 0; k < vecsize; k++ ) - { - float r0 = rptr[k]; - s00 += wptr0[k]*r0; - s10 += wptr1[k]*r0; - s20 += wptr2[k]*r0; - } - - if( relu ) - { - s00 = s00 > 0.f ? s00 : s00*r0; - s10 = s10 > 0.f ? s10 : s10*r1; - s20 = s20 > 0.f ? 
s20 : s20*r2; - } - - outptr0[j] = s00; - outptr1[j] = s10; - outptr2[j] = s20; - } - } - _mm256_zeroupper(); -} - -// dst = vec * weights^t + bias -void fastGEMM1T_avx2( const float* vec, const float* weights, - size_t wstep, const float* bias, - float* dst, int nvecs, int vecsize ) -{ - int i = 0; - - for( ; i <= nvecs - 8; i += 8 ) - { - const float* wptr = weights + i*wstep; - __m256 vs0 = _mm256_setzero_ps(), vs1 = _mm256_setzero_ps(), - vs2 = _mm256_setzero_ps(), vs3 = _mm256_setzero_ps(), - vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(), - vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps(); - - for( int k = 0; k < vecsize; k += 8, wptr += 8 ) - { - __m256 v = _mm256_load_ps(vec + k); - - vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0); - vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1); - vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2); - vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3); - vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4); - vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5); - vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6); - vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7); - } - - __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3)); - __m256 s1 = _mm256_hadd_ps(_mm256_hadd_ps(vs4, vs5), _mm256_hadd_ps(vs6, vs7)); - - s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1)); - s1 = _mm256_add_ps(s1, _mm256_permute2f128_ps(s1, s1, 1)); - - s0 = _mm256_add_ps(s0, _mm256_castps128_ps256(_mm_loadu_ps(bias + i))); - s1 = _mm256_add_ps(s1, _mm256_castps128_ps256(_mm_loadu_ps(bias + i + 4))); - - _mm_storeu_ps(dst + i, _mm256_castps256_ps128(s0)); - _mm_storeu_ps(dst + i + 4, _mm256_castps256_ps128(s1)); - } - - float temp = 0.f; - for( ; i < nvecs; i++ ) - { - const float* wptr = weights + i*wstep; - __m256 vs0 = _mm256_setzero_ps(); - - for( int k = 0; k < vecsize; k += 8, wptr += 8 ) - { - __m256 v = _mm256_load_ps(vec + k); - vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0); - } - - __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0); - s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1)); - _mm_store_ss(&temp, _mm256_castps256_ps128(s0)); - dst[i] = temp + bias[i]; - } - - _mm256_zeroupper(); -} - -void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr, - size_t bstep, float* cptr, size_t cstep, - int ma, int na, int nb ) -{ - int n = 0; - for( ; n <= nb - 16; n += 16 ) - { - for( int m = 0; m < ma; m += 4 ) - { - const float* aptr0 = aptr + astep*m; - const float* aptr1 = aptr + astep*std::min(m+1, ma-1); - const float* aptr2 = aptr + astep*std::min(m+2, ma-1); - const float* aptr3 = aptr + astep*std::min(m+3, ma-1); - - float* cptr0 = cptr + cstep*m; - float* cptr1 = cptr + cstep*std::min(m+1, ma-1); - float* cptr2 = cptr + cstep*std::min(m+2, ma-1); - float* cptr3 = cptr + cstep*std::min(m+3, ma-1); - - __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); - __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); - __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); - __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); - - for( int k = 0; k < na; k++ ) - { - __m256 a0 = _mm256_set1_ps(aptr0[k]); - __m256 a1 = _mm256_set1_ps(aptr1[k]); - __m256 a2 = _mm256_set1_ps(aptr2[k]); - __m256 a3 = _mm256_set1_ps(aptr3[k]); - __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n); - __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8); - d00 = _mm256_fmadd_ps(a0, b0, d00); - d01 = 
_mm256_fmadd_ps(a0, b1, d01); - d10 = _mm256_fmadd_ps(a1, b0, d10); - d11 = _mm256_fmadd_ps(a1, b1, d11); - d20 = _mm256_fmadd_ps(a2, b0, d20); - d21 = _mm256_fmadd_ps(a2, b1, d21); - d30 = _mm256_fmadd_ps(a3, b0, d30); - d31 = _mm256_fmadd_ps(a3, b1, d31); - } - - _mm256_storeu_ps(cptr0 + n, d00); - _mm256_storeu_ps(cptr0 + n + 8, d01); - _mm256_storeu_ps(cptr1 + n, d10); - _mm256_storeu_ps(cptr1 + n + 8, d11); - _mm256_storeu_ps(cptr2 + n, d20); - _mm256_storeu_ps(cptr2 + n + 8, d21); - _mm256_storeu_ps(cptr3 + n, d30); - _mm256_storeu_ps(cptr3 + n + 8, d31); - } - } - - for( ; n < nb; n++ ) - { - for( int m = 0; m < ma; m++ ) - { - const float* aptr0 = aptr + astep*m; - float* cptr0 = cptr + cstep*m; - float d0 = 0.f; - - for( int k = 0; k < na; k++ ) - d0 += aptr0[k]*bptr[k*bstep + n]; - - cptr0[n] = d0; - } - } - _mm256_zeroupper(); -} - -} -} +#include "layers_common.simd.hpp" diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp index 06f7825f3d..bbab2756f5 100644 --- a/modules/dnn/src/layers/layers_common.hpp +++ b/modules/dnn/src/layers/layers_common.hpp @@ -64,6 +64,19 @@ void getConvPoolPaddings(const Size& inp, const Size& out, const Size &kernel, const Size &stride, const String &padMode, Size &pad); +#if CV_TRY_AVX +void fastConv_avx(const float* weights, size_t wstep, const float* bias, + const float* rowbuf, float* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, + const float* relu, bool initOutput); +void fastGEMM1T_avx( const float* vec, const float* weights, + size_t wstep, const float* bias, + float* dst, int nvecs, int vecsize ); +void fastGEMM_avx( const float* aptr, size_t astep, const float* bptr0, + size_t bstep, float* cptr, size_t cstep, + int ma, int na, int nb ); +#endif + #if CV_TRY_AVX2 void fastConv_avx2(const float* weights, size_t wstep, const float* bias, const float* rowbuf, float* output, const int* outShape, diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp new file mode 100644 index 0000000000..1110ed0933 --- /dev/null +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -0,0 +1,352 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __DNN_LAYERS_COMMON_SIMD_HPP__ +#define __DNN_LAYERS_COMMON_SIMD_HPP__ + +namespace cv { +namespace dnn { + +void fastConv_some_avx( const float* weights, size_t wstep, const float* bias, + const float* rowbuf, float* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, + const float* relu, bool initOutput ) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + float r0 = 1.f, r1 = 1.f, r2 = 1.f; + __m256 vr0 = _mm256_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm256_setzero_ps(); + + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + const float* wptr0 = weights + i*wstep; + const float* wptr1 = wptr0 + wstep; + const float* wptr2 = wptr1 + wstep; + float* outptr0 = output + i*outPlaneSize; + float* outptr1 = outptr0 + outPlaneSize; + float* outptr2 = outptr1 + outPlaneSize; + float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + } + } + + if( relu ) + { + r0 = relu[i]; + r1 = relu[i+1]; + r2 = relu[i+2]; + vr0 = _mm256_set1_ps(r0); + vr1 = _mm256_set1_ps(r1); + vr2 = _mm256_set1_ps(r2); + } + + int j = 0; + for( ; j <= blockSize - 4; j += 4 ) + { + const float* rptr = rowbuf + j*vecsize_aligned; + + __m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(), + vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(), + vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(), + vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(), + vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(), + vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps(); + + for( int k = 0; k < vecsize; k += 8, rptr += 8 ) + { + __m256 w0 = _mm256_load_ps(wptr0 + k); + __m256 w1 = _mm256_load_ps(wptr1 + k); + __m256 w2 = _mm256_load_ps(wptr2 + k); + __m256 r0 = _mm256_load_ps(rptr); + + vs00 = _mm256_fmadd_ps(w0, r0, vs00); + vs10 = _mm256_fmadd_ps(w1, r0, vs10); + vs20 = _mm256_fmadd_ps(w2, r0, vs20); + + r0 = _mm256_load_ps(rptr + vecsize_aligned); + vs01 = _mm256_fmadd_ps(w0, r0, vs01); + vs11 = _mm256_fmadd_ps(w1, r0, vs11); + vs21 = _mm256_fmadd_ps(w2, r0, vs21); + + r0 = _mm256_load_ps(rptr + vecsize_aligned*2); + vs02 = _mm256_fmadd_ps(w0, r0, vs02); + vs12 = _mm256_fmadd_ps(w1, r0, vs12); + vs22 = _mm256_fmadd_ps(w2, r0, vs22); + + r0 = _mm256_load_ps(rptr + vecsize_aligned*3); + vs03 = _mm256_fmadd_ps(w0, r0, vs03); + vs13 = _mm256_fmadd_ps(w1, r0, vs13); + vs23 = _mm256_fmadd_ps(w2, r0, vs23); + } + + __m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), 
_mm256_hadd_ps(vs02, vs03)); + __m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), _mm256_hadd_ps(vs12, vs13)); + __m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23)); + + t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1)); + t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1)); + t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1)); + + __m256 s0, s1, s2; + + if( initOutput ) + { + s0 = _mm256_set1_ps(bias0); + s1 = _mm256_set1_ps(bias1); + s2 = _mm256_set1_ps(bias2); + } + else + { + s0 = _mm256_castps128_ps256(_mm_loadu_ps(outptr0 + j)); + s1 = _mm256_castps128_ps256(_mm_loadu_ps(outptr1 + j)); + s2 = _mm256_castps128_ps256(_mm_loadu_ps(outptr2 + j)); + } + + s0 = _mm256_add_ps(s0, t0); + s1 = _mm256_add_ps(s1, t1); + s2 = _mm256_add_ps(s2, t2); + + if( relu ) + { + __m256 m0 = _mm256_cmp_ps(s0, z, _CMP_GT_OS); + __m256 m1 = _mm256_cmp_ps(s1, z, _CMP_GT_OS); + __m256 m2 = _mm256_cmp_ps(s2, z, _CMP_GT_OS); + s0 = _mm256_xor_ps(s0, _mm256_andnot_ps(m0, _mm256_xor_ps(_mm256_mul_ps(s0, vr0), s0))); + s1 = _mm256_xor_ps(s1, _mm256_andnot_ps(m1, _mm256_xor_ps(_mm256_mul_ps(s1, vr1), s1))); + s2 = _mm256_xor_ps(s2, _mm256_andnot_ps(m2, _mm256_xor_ps(_mm256_mul_ps(s2, vr2), s2))); + } + + _mm_storeu_ps(outptr0 + j, _mm256_castps256_ps128(s0)); + _mm_storeu_ps(outptr1 + j, _mm256_castps256_ps128(s1)); + _mm_storeu_ps(outptr2 + j, _mm256_castps256_ps128(s2)); + } + + for( ; j < blockSize; j++ ) + { + const float* rptr = rowbuf + j*vecsize_aligned; + float s00, s10, s20; + + if( initOutput ) + { + s00 = bias0; + s10 = bias1; + s20 = bias2; + } + else + { + s00 = outptr0[j]; + s10 = outptr1[j]; + s20 = outptr2[j]; + } + + for( int k = 0; k < vecsize; k++ ) + { + float r0 = rptr[k]; + s00 += wptr0[k]*r0; + s10 += wptr1[k]*r0; + s20 += wptr2[k]*r0; + } + + if( relu ) + { + s00 = s00 > 0.f ? s00 : s00*r0; + s10 = s10 > 0.f ? s10 : s10*r1; + s20 = s20 > 0.f ? 
s20 : s20*r2; + } + + outptr0[j] = s00; + outptr1[j] = s10; + outptr2[j] = s20; + } + } + _mm256_zeroupper(); +} + +// dst = vec * weights^t + bias +void fastGEMM1T_some_avx( const float* vec, const float* weights, + size_t wstep, const float* bias, + float* dst, int nvecs, int vecsize ) +{ + int i = 0; + + for( ; i <= nvecs - 8; i += 8 ) + { + const float* wptr = weights + i*wstep; + __m256 vs0 = _mm256_setzero_ps(), vs1 = _mm256_setzero_ps(), + vs2 = _mm256_setzero_ps(), vs3 = _mm256_setzero_ps(), + vs4 = _mm256_setzero_ps(), vs5 = _mm256_setzero_ps(), + vs6 = _mm256_setzero_ps(), vs7 = _mm256_setzero_ps(); + + for( int k = 0; k < vecsize; k += 8, wptr += 8 ) + { + __m256 v = _mm256_load_ps(vec + k); + + vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0); + vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1); + vs2 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*2), v, vs2); + vs3 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*3), v, vs3); + vs4 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*4), v, vs4); + vs5 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*5), v, vs5); + vs6 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*6), v, vs6); + vs7 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep*7), v, vs7); + } + + __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs1), _mm256_hadd_ps(vs2, vs3)); + __m256 s1 = _mm256_hadd_ps(_mm256_hadd_ps(vs4, vs5), _mm256_hadd_ps(vs6, vs7)); + + s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1)); + s1 = _mm256_add_ps(s1, _mm256_permute2f128_ps(s1, s1, 1)); + + s0 = _mm256_add_ps(s0, _mm256_castps128_ps256(_mm_loadu_ps(bias + i))); + s1 = _mm256_add_ps(s1, _mm256_castps128_ps256(_mm_loadu_ps(bias + i + 4))); + + _mm_storeu_ps(dst + i, _mm256_castps256_ps128(s0)); + _mm_storeu_ps(dst + i + 4, _mm256_castps256_ps128(s1)); + } + + float temp = 0.f; + for( ; i < nvecs; i++ ) + { + const float* wptr = weights + i*wstep; + __m256 vs0 = _mm256_setzero_ps(); + + for( int k = 0; k < vecsize; k += 8, wptr += 8 ) + { + __m256 v = _mm256_load_ps(vec + k); + vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0); + } + + __m256 s0 = _mm256_hadd_ps(_mm256_hadd_ps(vs0, vs0), vs0); + s0 = _mm256_add_ps(s0, _mm256_permute2f128_ps(s0, s0, 1)); + _mm_store_ss(&temp, _mm256_castps256_ps128(s0)); + dst[i] = temp + bias[i]; + } + + _mm256_zeroupper(); +} + +void fastGEMM_some_avx( const float* aptr, size_t astep, const float* bptr, + size_t bstep, float* cptr, size_t cstep, + int ma, int na, int nb ) +{ + int n = 0; + for( ; n <= nb - 16; n += 16 ) + { + for( int m = 0; m < ma; m += 4 ) + { + const float* aptr0 = aptr + astep*m; + const float* aptr1 = aptr + astep*std::min(m+1, ma-1); + const float* aptr2 = aptr + astep*std::min(m+2, ma-1); + const float* aptr3 = aptr + astep*std::min(m+3, ma-1); + + float* cptr0 = cptr + cstep*m; + float* cptr1 = cptr + cstep*std::min(m+1, ma-1); + float* cptr2 = cptr + cstep*std::min(m+2, ma-1); + float* cptr3 = cptr + cstep*std::min(m+3, ma-1); + + __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); + __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); + __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); + __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); + + for( int k = 0; k < na; k++ ) + { + __m256 a0 = _mm256_set1_ps(aptr0[k]); + __m256 a1 = _mm256_set1_ps(aptr1[k]); + __m256 a2 = _mm256_set1_ps(aptr2[k]); + __m256 a3 = _mm256_set1_ps(aptr3[k]); + __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n); + __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8); + d00 = _mm256_fmadd_ps(a0, b0, d00); + d01 = 
_mm256_fmadd_ps(a0, b1, d01); + d10 = _mm256_fmadd_ps(a1, b0, d10); + d11 = _mm256_fmadd_ps(a1, b1, d11); + d20 = _mm256_fmadd_ps(a2, b0, d20); + d21 = _mm256_fmadd_ps(a2, b1, d21); + d30 = _mm256_fmadd_ps(a3, b0, d30); + d31 = _mm256_fmadd_ps(a3, b1, d31); + } + + _mm256_storeu_ps(cptr0 + n, d00); + _mm256_storeu_ps(cptr0 + n + 8, d01); + _mm256_storeu_ps(cptr1 + n, d10); + _mm256_storeu_ps(cptr1 + n + 8, d11); + _mm256_storeu_ps(cptr2 + n, d20); + _mm256_storeu_ps(cptr2 + n + 8, d21); + _mm256_storeu_ps(cptr3 + n, d30); + _mm256_storeu_ps(cptr3 + n + 8, d31); + } + } + + for( ; n < nb; n++ ) + { + for( int m = 0; m < ma; m++ ) + { + const float* aptr0 = aptr + astep*m; + float* cptr0 = cptr + cstep*m; + float d0 = 0.f; + + for( int k = 0; k < na; k++ ) + d0 += aptr0[k]*bptr[k*bstep + n]; + + cptr0[n] = d0; + } + } + _mm256_zeroupper(); +} + +} +} + +#endif
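
The core of this patch is that the former AVX2-only kernel bodies now live once in layers_common.simd.hpp and are compiled twice: layers_common.avx2.cpp maps fastConv_some_avx / fastGEMM_some_avx / fastGEMM1T_some_avx onto the *_avx2 names, while the new layers_common.avx.cpp maps them onto *_avx and redefines _mm256_fmadd_ps as _mm256_add_ps(c, _mm256_mul_ps(a, b)), because FMA is not part of AVX1 (e.g. IvyBridge). The standalone sketch below illustrates the same single-source pattern on a toy dot-product kernel; it is not part of the patch, and the names DEFINE_DOT8, dot8_avx and dot8_avx2 are hypothetical.

// single_source_simd.cpp -- minimal sketch of the pattern used by
// layers_common.simd.hpp: one kernel body, two instantiations, the AVX1
// variant expressing fmadd as mul + add (hypothetical names throughout).
#include <immintrin.h>
#include <cstdio>

// Kernel body shared by both variants; KERNEL_NAME and MY_FMADD play the role
// of the fastConv_some_avx / _mm256_fmadd_ps remapping done in the .cpp files.
#define DEFINE_DOT8(KERNEL_NAME, MY_FMADD)                              \
    static float KERNEL_NAME(const float* a, const float* b, int n)    \
    {                                                                   \
        __m256 acc = _mm256_setzero_ps();                               \
        for (int k = 0; k + 8 <= n; k += 8)                             \
            acc = MY_FMADD(_mm256_loadu_ps(a + k),                      \
                           _mm256_loadu_ps(b + k), acc);                \
        float buf[8]; _mm256_storeu_ps(buf, acc);                       \
        float s = 0.f;                                                  \
        for (int i = 0; i < 8; i++) s += buf[i];                        \
        return s;                                                       \
    }

// "AVX1" variant: no FMA instruction, so fmadd(a, b, c) becomes c + a*b,
// exactly as the #define in layers_common.avx.cpp does.
#define FMADD_AVX(a, b, c) _mm256_add_ps((c), _mm256_mul_ps((a), (b)))
DEFINE_DOT8(dot8_avx, FMADD_AVX)

// "AVX2" variant: use the real fused multiply-add when the build allows it.
#if defined(__FMA__)
DEFINE_DOT8(dot8_avx2, _mm256_fmadd_ps)
#endif

int main()
{
    float a[8], b[8];
    for (int i = 0; i < 8; i++) { a[i] = 1.f; b[i] = (float)i; }
    std::printf("avx:  %f\n", dot8_avx(a, b, 8));   // 0+1+...+7 = 28
#if defined(__FMA__)
    std::printf("avx2: %f\n", dot8_avx2(a, b, 8));
#endif
    return 0;
}

Compile the sketch with -mavx (plus -mfma to get the second variant); the two instantiations differ only in rounding of the intermediate products, which is why reusing the AVX2 source for AVX1 CPUs is numerically safe.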
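The other half of the change is runtime selection: ParallelConv, the GEMM body and FullyConnected now cache checkHardwareSupport(CPU_AVX) / checkHardwareSupport(CPU_AVX2) in the useAVX / useAVX2 members and take the widest kernel that was both compiled in (CV_TRY_AVX / CV_TRY_AVX2) and is supported by the host CPU, falling through to the plain scalar loop otherwise. A condensed sketch of that selection logic follows; CV_TRY_AVX*, CPU_AVX* and cv::checkHardwareSupport() are the real OpenCV facilities used by the patch, while ParallelGemmSketch and the gemm_* kernels are stand-ins so the example compiles on its own.

// dispatch_sketch.cpp -- hypothetical condensation of the dispatch the patch
// adds around fastConv/fastGEMM; the gemm_* bodies only report which path ran.
#include <opencv2/core.hpp>
#include <opencv2/core/utility.hpp>
#include <cstdio>

static void gemm_avx2(float*)      { std::printf("AVX2 path\n");   }
static void gemm_avx(float*)       { std::printf("AVX path\n");    }
static void gemm_reference(float*) { std::printf("scalar path\n"); }

struct ParallelGemmSketch
{
    bool useAVX, useAVX2;

    ParallelGemmSketch()
    {
        // As in the patch: query the CPU once and cache the result in the
        // parallel body, so each invocation pays only for a branch.
        useAVX  = cv::checkHardwareSupport(CPU_AVX);
        useAVX2 = cv::checkHardwareSupport(CPU_AVX2);
    }

    void run(float* c) const
    {
        // Prefer the widest kernel the build provides and the CPU supports.
    #if CV_TRY_AVX2
        if (useAVX2) { gemm_avx2(c); return; }
    #endif
    #if CV_TRY_AVX
        if (useAVX)  { gemm_avx(c);  return; }
    #endif
        gemm_reference(c);
    }
};

int main()
{
    float c[1] = { 0.f };
    ParallelGemmSketch().run(c);
    return 0;
}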