Merge pull request #18036 from alalek:backport_17858

5 years ago · 1067cd0649
parent 5b5c42d2c7 1537ecd931
commit 1067cd0649
2 changed files with 499 additions and 36 deletions
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@ -648,6 +648,7 @@ public:
        bool useAVX;
        bool useAVX2;
        bool useAVX512;
+        int blk_size_cn;

        ParallelConv()
            : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
@ -704,12 +705,17 @@ public:
            p.useAVX2   = checkHardwareSupport(CPU_AVX2) && isConv2D;
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX  && isConv2D;

-            int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
-
            int kernel_d = !isConv2D? kernel_size[0] : 1;
            int kernel_h = kernel_size[kernel_size.size() - 2];
            int kernel_w = kernel_size.back();

+            int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
+            int ncn = 16;
+            while (ncn*2 < blk_size_cn0 && ncn < inpCn)
+                ncn *= 2;
+            ncn = std::min(ncn, inpCn);
+            p.blk_size_cn = ncn;
+
            int dil_d = !isConv2D? dilations[0] : 1;
            int dil_h = dilations[dilations.size() - 2];
            int dil_w = dilations.back();
@ -777,18 +783,26 @@ public:
            int dilation_w = dilations.back();

            int i, j, k, d;
-            size_t inpPlaneSize = input_->total(2);
-            size_t outPlaneSize = output_->total(2);
+            int inpPlaneSize = (int)input_->total(2);
+            int outPlaneSize = (int)output_->total(2);
            bool is1x1 = is1x1_;

            int stripesPerSample;
-            size_t stripeSize;
+            int stripeSize;
            Range r = r0;
-
-            if( nstripes >= batchSize*2 )
+            bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
+                outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
+                width >= 16 + dilation_w*(kernel_w - 1);
+            // for now only 3x3 depth-wise convolutions are supported
+            depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
+                // computing at most 1 pixel from each side can involve padding
+                max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
+                pad_l <= 1 && pad_t <= 1;
+
+            if( !depthWiseConvolution && nstripes >= batchSize*2 )
            {
                stripesPerSample = nstripes/batchSize;
-                stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
+                stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
                stripeSize = std::min(stripeSize, outPlaneSize);
            }
            else
@ -807,20 +821,29 @@ public:
            const float* biasptr_ = &biasvec_->at(0);
            const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0);
            float* data_out0_ = output_->ptr<float>();
-            size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
-            AutoBuffer<float> rowbuf0_(rowbufsz + valign);
-            float* rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
-
-            // we clear the buffer once; ultimately, it lets us to avoid
-            // tail processing after running the unrolled/vectorized loop.
-            // the main idea is to make sure that the tail (a.k.a. padding) of each row
-            // (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
-            // does not contain NaNs or Infs. Because the padding in the weights
-            // matrix is explicitly initialized with 0's, we handle all other
-            // cases nicely, i.e. we can skip expliciting re-initialization
-            // of the padding - we just retain elements from the previous iteration
-            // of the loop over channels (cn0).
-            memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
+            AutoBuffer<float> rowbuf0_;
+            float* rowbuf0 = 0;
+            bool use_rowbuf = !depthWiseConvolution;
+            int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
+
+            // im2row buffer is not used for depth-wise convolution
+            if(use_rowbuf)
+            {
+                size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
+                //printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
+                rowbuf0_.allocate(rowbufsz + valign);
+                rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
+                // we clear the buffer once; ultimately, it lets us to avoid
+                // tail processing after running the unrolled/vectorized loop.
+                // the main idea is to make sure that the tail (a.k.a. padding) of each row
+                // (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
+                // does not contain NaNs or Infs. Because the padding in the weights
+                // matrix is explicitly initialized with 0's, we handle all other
+                // cases nicely, i.e. we can skip expliciting re-initialization
+                // of the padding - we just retain elements from the previous iteration
+                // of the loop over channels (cn0).
+                memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
+            }

            for( int stripe = r.start; stripe < r.end; stripe++ )
            {
@ -835,28 +858,213 @@ public:
                const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
                const float* biasptr = biasptr_ + startOutCn;

-                for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
+                for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
                {
-                    int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
+                    int cn1 = std::min(cn0 + blk_size_cn, inpCn);
                    int ncn = cn1 - cn0, vsz = karea*ncn;
                    int vsz_a = (int)alignSize(vsz, valign);
                    const float* wptr = wptr_orig + cn0*karea;
                    // we apply [Channels][P]ReLU (if any) during the final pass only.
                    const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0;

-                    for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
+                    for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
                    {
-                        int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
+                        int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
+                        int bsz = ofs1 - ofs0;

                        int out_d = ofs0 / (outH * outW);
                        int out_i = (ofs0 - out_d * outH * outW) / outW;
                        int out_j = ofs0 % outW;

+                        if (depthWiseConvolution)
+                        {
+                            CV_Assert(out_i == 0 && out_j == 0);
+                            int in_d = out_d * stride_d - pad_d;
+                            const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
+                            float* outptr_ = data_out0 + ofs0;
+
+                        #if CV_TRY_AVX2
+                            if(useAVX2)
+                                opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+                            else
+                        #endif
+                        #if CV_TRY_AVX
+                            if(useAVX)
+                                opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+                            else
+                        #endif
+                            {
+                                const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                                            w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                                            w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+                                int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+                                float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
+
+                                for (int out_i = 0; out_i < outH; out_i++)
+                                {
+                                    int in_i = out_i * stride_h - pad_t, out_j = 0;
+                                    const float* imgptr0 = inptr_ + in_i*width;
+                                    const float* imgptr1 = imgptr0 + dilation_h*width;
+                                    const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
+                                    float out, w00 = w00_, w01 = w01_, w02 = w02_;
+                                    float w20 = w20_, w21 = w21_, w22 = w22_;
+                                    if (in_i < 0)
+                                    {
+                                        w00 = w01 = w02 = 0.f;
+                                        imgptr0 = imgptr1;
+                                    }
+                                    else if (in_i + dilation_h*(kernel_h-1) >= height)
+                                    {
+                                        w20 = w21 = w22 = 0.f;
+                                        imgptr2 = imgptr1;
+                                    }
+                                    float* outptr = outptr_ + out_i*outW;
+                                    if (pad_l > 0)
+                                    {
+                                        out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
+                                              imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
+                                              imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
+                                        if (relu)
+                                            out = out > 0.f ? out : out*relu_coeff;
+                                        outptr[0] = out;
+                                        out_j = 1;
+                                    }
+
+                                #if CV_SIMD
+                                    // maybe with AVX or AVX512 strided depthwise convolution
+                                    // can be accelerated with vector code, but with 4xfloat vectors
+                                    // it's hardly the case
+                                    if( stride_w == 1 )
+                                    {
+                                        const int VECSZ = v_float32::nlanes;
+                                        const int out_delta = VECSZ/stride_w;
+                                        v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02),
+                                                  vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12),
+                                                  vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22);
+                                        v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff);
+                                        for( ; out_j < outW1; out_j += out_delta )
+                                        {
+                                            if (out_j + out_delta > outW1)
+                                            {
+                                                if (out_j <= pad_l)
+                                                    break;
+                                                out_j = outW1 - out_delta;
+                                            }
+                                            int in_j = out_j * stride_w - pad_l;
+                                            v_float32 v00 = vx_load(imgptr0 + in_j),
+                                                      v01 = vx_load(imgptr0 + in_j + dilation_w),
+                                                      v02 = vx_load(imgptr0 + in_j + dilation_w*2),
+                                                      v10 = vx_load(imgptr1 + in_j),
+                                                      v11 = vx_load(imgptr1 + in_j + dilation_w),
+                                                      v12 = vx_load(imgptr1 + in_j + dilation_w*2),
+                                                      v20 = vx_load(imgptr2 + in_j),
+                                                      v21 = vx_load(imgptr2 + in_j + dilation_w),
+                                                      v22 = vx_load(imgptr2 + in_j + dilation_w*2);
+
+                                            v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 +
+                                                             v10*vw10 + v11*vw11 + v12*vw12 +
+                                                             v20*vw20 + v21*vw21 + v22*vw22 + vbias;
+                                            if (relu)
+                                                vout = v_select(vout > z, vout, vout*vrc);
+                                            vx_store(outptr + out_j, vout);
+                                        }
+                                    }
+                                #endif
+                                    for (; out_j < outW1; out_j++)
+                                    {
+                                        int in_j = out_j * stride_w - pad_l;
+                                        out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
+                                              imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
+                                              imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
+                                        if (relu)
+                                            out = out > 0.f ? out : out*relu_coeff;
+                                        outptr[out_j] = out;
+                                    }
+
+                                    for (; out_j < outW; out_j++ )
+                                    {
+                                        int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+                                        float s0 = 1.f, s1 = 1.f, s2 = 1.f;
+                                        if (in_j0 >= width)
+                                        {
+                                            in_j0 = 0;
+                                            s0 = 0.f;
+                                        }
+                                        if (in_j1 >= width)
+                                        {
+                                            in_j1 = 0;
+                                            s1 = 0.f;
+                                        }
+                                        if (in_j2 >= width)
+                                        {
+                                            in_j2 = 0;
+                                            s2 = 0.f;
+                                        }
+                                        out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
+                                              imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
+                                              imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
+                                        if (relu)
+                                            out = out > 0.f ? out : out*relu_coeff;
+                                        outptr[out_j] = out;
+                                    }
+                                }
+                            }
+                            continue;
+                        }
+
                        // do im2row for a part of input tensor
                        float* rowbuf = rowbuf0;

                        if (isConv2D)
                        {
+                            if( is1x1 && stride_w == 1 && stride_h == 1 )
+                            {
+                                const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
+                                for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
+                                {
+                                    if( j + 4 <= bsz )
+                                    {
+                                        k = 0;
+                                    #if CV_SIMD128
+                                        for( ; k <= vsz - 4; k += 4 )
+                                        {
+                                            const float* inp = imgptr + j + k*inpPlaneSize;
+                                            v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize);
+                                            v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3);
+                                            v_float32x4 r0, r1, r2, r3;
+                                            v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3);
+                                            v_store(rowbuf + k, r0);
+                                            v_store(rowbuf + k + vsz_a, r1);
+                                            v_store(rowbuf + k + vsz_a*2, r2);
+                                            v_store(rowbuf + k + vsz_a*3, r3);
+                                        }
+                                    #endif
+                                        for( ; k < vsz; k++ )
+                                        {
+                                            const float* inp = imgptr + j + k*inpPlaneSize;
+                                            float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
+                                            rowbuf[k] = v0;
+                                            rowbuf[k + vsz_a] = v1;
+                                            rowbuf[k + vsz_a*2] = v2;
+                                            rowbuf[k + vsz_a*3] = v3;
+                                        }
+                                        j += 3;
+                                        rowbuf += vsz_a*3;
+                                    }
+                                    else
+                                    {
+                                        for( k = 0; k < vsz; k++ )
+                                        {
+                                            rowbuf[k] = imgptr[j + k*inpPlaneSize];
+                                        }
+                                    }
+                                }
+                            }
+                            else
                            for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
                            {
                                int delta = std::min(ofs1 - ofs, outW - out_j);
@ -976,7 +1184,6 @@ public:

                        // now compute dot product of the weights
                        // and im2row-transformed part of the tensor
-                        int bsz = ofs1 - ofs0;
                    #if CV_TRY_AVX512_SKX
                        /* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */
                        if(useAVX512)
--- a/modules/dnn/src/layers/layers_common.simd.hpp
+++ b/modules/dnn/src/layers/layers_common.simd.hpp
@ -50,6 +50,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
               const float* rowbuf, float* output, const int* outShape,
               int blockSize, int vecsize, int vecsize_aligned,
               const float* relu, bool initOutput );
+void fastDepthwiseConv( const float* weights,
+                      int kernel_h, int kernel_w,
+                      int stride_h, int stride_w,
+                      int dilation_h, int dilation_w,
+                      int pad_t, int pad_l,
+                      const float* bias, const float* relu,
+                      const float* inptr,
+                      int height, int width,
+                      float* outptr,
+                      int out_d, int outH, int outW );
 void fastGEMM1T( const float* vec, const float* weights,
                 size_t wstep, const float* bias,
                 float* dst, int nvecs, int vecsize );
@ -64,6 +74,8 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
 #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
 #endif

+enum { FASCONV_BASE_VECSZ = 4 };
+
 void fastConv( const float* weights, size_t wstep, const float* bias,
               const float* rowbuf, float* output, const int* outShape,
               int blockSize, int vecsize, int vecsize_aligned,
@ -73,6 +85,11 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
    size_t outPlaneSize = outShape[2]*outShape[3];
    float r0 = 1.f, r1 = 1.f, r2 = 1.f;
    __m128 vr0 = _mm_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm_setzero_ps();
+    int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
+    int rsz = blockSize % FASCONV_BASE_VECSZ;
+    for( int i = 0; i < rsz; i++ )
+        maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
+    __m128 mask = _mm_loadu_ps((const float*)maskbuf);

    // now compute dot product of the weights
    // and im2row-transformed part of the tensor
@ -114,8 +131,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
        }

        int j = 0;
-        for( ; j <= blockSize - 4; j += 4 )
+        for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
        {
+            bool tail = false;
+            if (j + FASCONV_BASE_VECSZ > blockSize)
+            {
+                if (j == 0)
+                    break;
+                j = blockSize - FASCONV_BASE_VECSZ;
+                tail = true;
+            }
            int k = 0;
            const float* rptr = rowbuf + j*vecsize_aligned;

@ -243,9 +268,16 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
                __m128 m0 = _mm_cmp_ps(s0, z, _CMP_GT_OS);
                __m128 m1 = _mm_cmp_ps(s1, z, _CMP_GT_OS);
                __m128 m2 = _mm_cmp_ps(s2, z, _CMP_GT_OS);
-                s0 = _mm_xor_ps(s0, _mm_andnot_ps(m0, _mm_xor_ps(_mm_mul_ps(s0, vr0), s0)));
-                s1 = _mm_xor_ps(s1, _mm_andnot_ps(m1, _mm_xor_ps(_mm_mul_ps(s1, vr1), s1)));
-                s2 = _mm_xor_ps(s2, _mm_andnot_ps(m2, _mm_xor_ps(_mm_mul_ps(s2, vr2), s2)));
+                s0 = _mm_blendv_ps(_mm_mul_ps(s0, vr0), s0, m0);
+                s1 = _mm_blendv_ps(_mm_mul_ps(s1, vr1), s1, m1);
+                s2 = _mm_blendv_ps(_mm_mul_ps(s2, vr2), s2, m2);
+            }
+
+            if( tail )
+            {
+                s0 = _mm_blendv_ps(_mm_loadu_ps(outptr0 + j), s0, mask);
+                s1 = _mm_blendv_ps(_mm_loadu_ps(outptr1 + j), s1, mask);
+                s2 = _mm_blendv_ps(_mm_loadu_ps(outptr2 + j), s2, mask);
            }

            _mm_storeu_ps(outptr0 + j, s0);
@ -253,9 +285,55 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
            _mm_storeu_ps(outptr2 + j, s2);
        }

+        for( ; j <= blockSize - 2; j += 2 )
+        {
+            const float* rptr0 = rowbuf + j*vecsize_aligned;
+            const float* rptr1 = rowbuf + (j+1)*vecsize_aligned;
+            float s00, s01, s10, s11, s20, s21;
+
+            if( initOutput )
+            {
+                s00 = s01 = bias0;
+                s10 = s11 = bias1;
+                s20 = s21 = bias2;
+            }
+            else
+            {
+                s00 = outptr0[j]; s01 = outptr0[j+1];
+                s10 = outptr1[j]; s11 = outptr1[j+1];
+                s20 = outptr2[j]; s21 = outptr2[j+1];
+            }
+
+            for( int k = 0; k < vecsize; k++ )
+            {
+                float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                float r = rptr0[k];
+                s00 += w0*r; s10 += w1*r; s20 += w2*r;
+                r = rptr1[k];
+                s01 += w0*r; s11 += w1*r; s21 += w2*r;
+            }
+
+            if( relu )
+            {
+                s00 = s00 > 0.f ? s00 : s00*r0;
+                s01 = s01 > 0.f ? s01 : s01*r0;
+                s10 = s10 > 0.f ? s10 : s10*r1;
+                s11 = s11 > 0.f ? s11 : s11*r1;
+                s20 = s20 > 0.f ? s20 : s20*r2;
+                s21 = s21 > 0.f ? s21 : s21*r2;
+            }
+
+            outptr0[j] = s00;
+            outptr0[j+1] = s01;
+            outptr1[j] = s10;
+            outptr1[j+1] = s11;
+            outptr2[j] = s20;
+            outptr2[j+1] = s21;
+        }
+
        for( ; j < blockSize; j++ )
        {
-            const float* rptr = rowbuf + j*vecsize_aligned;
+            const float* rptr0 = rowbuf + j*vecsize_aligned;
            float s00, s10, s20;

            if( initOutput )
@ -273,10 +351,9 @@ void fastConv( const float* weights, size_t wstep, const float* bias,

            for( int k = 0; k < vecsize; k++ )
            {
-                float r0 = rptr[k];
-                s00 += wptr0[k]*r0;
-                s10 += wptr1[k]*r0;
-                s20 += wptr2[k]*r0;
+                float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                float r = rptr0[k];
+                s00 += w0*r; s10 += w1*r; s20 += w2*r;
            }

            if( relu )
@ -294,6 +371,185 @@ void fastConv( const float* weights, size_t wstep, const float* bias,
    _mm256_zeroupper();
 }

+static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b)
+{
+    __m256 t0 = _mm256_loadu_ps(ptr);
+    __m256 t1 = _mm256_loadu_ps(ptr + 8);
+
+    __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16);
+    __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16);
+    a = _mm256_shuffle_ps(lo, hi, 0x88);
+    b = _mm256_shuffle_ps(lo, hi, 0xdd);
+}
+
+void fastDepthwiseConv( const float* wptr,
+                     int kernel_h, int kernel_w,
+                     int stride_h, int stride_w,
+                     int dilation_h, int dilation_w,
+                     int pad_t, int pad_l,
+                     const float* biasptr, const float* relu,
+                     const float* inptr_,
+                     int height, int width,
+                     float* outptr_,
+                     int out_d, int outH, int outW )
+{
+    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+    int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
+
+    for (int out_i = 0; out_i < outH; out_i++)
+    {
+        int in_i = out_i * stride_h - pad_t, out_j = 0;
+        const float* imgptr0 = inptr_ + in_i*width;
+        const float* imgptr1 = imgptr0 + dilation_h*width;
+        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
+        float out, w00 = w00_, w01 = w01_, w02 = w02_;
+        float w20 = w20_, w21 = w21_, w22 = w22_;
+        if (in_i < 0)
+        {
+            w00 = w01 = w02 = 0.f;
+            imgptr0 = imgptr1;
+        }
+        else if (in_i + dilation_h*(kernel_h-1) >= height)
+        {
+            w20 = w21 = w22 = 0.f;
+            imgptr2 = imgptr1;
+        }
+        float* outptr = outptr_ + out_i*outW;
+        if (pad_l > 0)
+        {
+            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
+                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
+                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[0] = out;
+            out_j = 1;
+        }
+
+        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
+        {
+            const int VECSZ = 8;
+            __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02),
+                      vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12),
+                      vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22);
+            __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff);
+
+            if( stride_w == 1 )
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1 && out_j > pad_l)
+                        out_j = outW1 - VECSZ;
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j),
+                           v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w),
+                           v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2),
+                           v10 = _mm256_loadu_ps(imgptr1 + in_j),
+                           v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w),
+                           v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2),
+                           v20 = _mm256_loadu_ps(imgptr2 + in_j),
+                           v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w),
+                           v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2);
+
+                    __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
+                    __m256 vout1 = _mm256_mul_ps(v01, vw01);
+                    __m256 vout2 = _mm256_mul_ps(v02, vw02);
+
+                    vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
+                    vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
+                    vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
+
+                    vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
+                    vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
+                    vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
+
+                    vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
+                    if (relu)
+                    {
+                        __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
+                        vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
+                    }
+                    _mm256_storeu_ps(outptr + out_j, vout0);
+                }
+            else
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1 && out_j > pad_l)
+                        out_j = outW1 - VECSZ;
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
+                    _mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
+                    _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
+                    _mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
+                    _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
+                    _mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
+                    _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
+
+                    __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias);
+                    __m256 vout1 = _mm256_mul_ps(v01, vw01);
+                    __m256 vout2 = _mm256_mul_ps(v02, vw02);
+
+                    vout0 = _mm256_fmadd_ps(v10, vw10, vout0);
+                    vout1 = _mm256_fmadd_ps(v11, vw11, vout1);
+                    vout2 = _mm256_fmadd_ps(v12, vw12, vout2);
+
+                    vout0 = _mm256_fmadd_ps(v20, vw20, vout0);
+                    vout1 = _mm256_fmadd_ps(v21, vw21, vout1);
+                    vout2 = _mm256_fmadd_ps(v22, vw22, vout2);
+
+                    vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2);
+                    if (relu)
+                    {
+                        __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ);
+                        vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m);
+                    }
+                    _mm256_storeu_ps(outptr + out_j, vout0);
+                }
+        }
+
+        for (; out_j < outW1; out_j++)
+        {
+            int in_j = out_j * stride_w - pad_l;
+            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
+                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
+                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[out_j] = out;
+        }
+
+        for (; out_j < outW; out_j++ )
+        {
+            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
+            if (in_j0 >= width)
+            {
+                in_j0 = 0;
+                s0 = 0.f;
+            }
+            if (in_j1 >= width)
+            {
+                in_j1 = 0;
+                s1 = 0.f;
+            }
+            if (in_j2 >= width)
+            {
+                in_j2 = 0;
+                s2 = 0.f;
+            }
+            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
+                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
+                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[out_j] = out;
+        }
+    }
+    _mm256_zeroupper();
+}
+
 // dst = vec * weights^t + bias
 void fastGEMM1T( const float* vec, const float* weights,
                 size_t wstep, const float* bias,