libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI

This commit enables assembly code using Intel AVX512 VNNI for the sobel filter and adds a unit test for it.

Benchmark:
sobel_c: 4537
sobel_avx512icl: 2136

Signed-off-by: bwang30 <bin.wang@intel.com>
Signed-off-by: Haihao Xiang <haihao.xiang@intel.com>
2 years ago · 3ab11dc5bb
parent 2d25f33a7e
commit 3ab11dc5bb
9 changed files with 362 additions and 78 deletions
--- a/libavfilter/convolution.h
+++ b/libavfilter/convolution.h
@ -21,6 +21,7 @@
 #ifndef AVFILTER_CONVOLUTION_H
 #define AVFILTER_CONVOLUTION_H
 #include "avfilter.h"
 #include "libavutil/intreadwrite.h"
 enum MatrixMode {
    MATRIX_SQUARE,
@ -61,4 +62,77 @@ typedef struct ConvolutionContext {
 } ConvolutionContext;
 void ff_convolution_init_x86(ConvolutionContext *s);
 void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes);
 /**
  * Fill c[0..8] with pointers to the nine taps of a 3x3 window centred on
  * (x, y), stored row by row (tap index = row * 3 + column).
  *
  * Coordinates below zero are mirrored through FFABS(); coordinates at or
  * beyond the plane size are mirrored as 2 * size - 1 - coord.
  *
  * @param radius unused
  * @param c      receives the nine tap pointers
  * @param src    base pointer of the source plane
  * @param stride byte distance between source rows
  * @param x, y   centre position of the window
  * @param w, h   plane dimensions used for mirroring
  * @param bpc    bytes per sample
  */
 static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int stride,
                       int x, int w, int y, int h, int bpc)
 {
    for (int row = 0; row < 3; row++) {
        int py = FFABS(y + row - 1);

        if (py >= h)
            py = 2 * h - 1 - py;

        for (int col = 0; col < 3; col++) {
            int px = FFABS(x + (col - 1));

            if (px >= w)
                px = 2 * w - 1 - px;

            c[row * 3 + col] = src + px * bpc + py * stride;
        }
    }
 }
 /**
  * 8-bit Sobel kernel for one run of pixels.
  *
  * c[] holds the 3x3 tap pointers in the row-major order produced by
  * setup_3x3(): c[0..2] is the row above, c[3]/c[5] the left/right
  * neighbours and c[6..8] the row below. The centre tap c[4] is not read.
  *
  * For each pixel the two gradient sums are combined as
  * sqrt(a*a + b*b) * scale + delta and clipped to 0..255.
  * matrix, peak, radius, dstride, stride and size are unused.
  */
 static void filter_sobel(uint8_t *dst, int width,
                          float scale, float delta, const int *const matrix,
                          const uint8_t *c[], int peak, int radius,
                          int dstride, int stride, int size)
 {
    const uint8_t *const top_l = c[0], *const top_m = c[1], *const top_r = c[2];
    const uint8_t *const mid_l = c[3], *const mid_r = c[5];
    const uint8_t *const bot_l = c[6], *const bot_m = c[7], *const bot_r = c[8];

    for (int i = 0; i < width; i++) {
        /* Row below minus row above, centre column weighted twice. */
        const float suma = (bot_l[i] + 2 * bot_m[i] + bot_r[i]) -
                           (top_l[i] + 2 * top_m[i] + top_r[i]);
        /* Right column minus left column, centre row weighted twice. */
        const float sumb = (top_r[i] + 2 * mid_r[i] + bot_r[i]) -
                           (top_l[i] + 2 * mid_l[i] + bot_l[i]);

        dst[i] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta);
    }
 }
 /**
  * Sobel kernel for samples stored as 16-bit words.
  *
  * Same tap layout as the 8-bit version (c[0..2] row above, c[3]/c[5]
  * left/right, c[6..8] row below; c[4] unused), but every tap is read with
  * AV_RN16A() at byte offset 2 * x and the result is clipped to 0..peak.
  * matrix, radius, dstride, stride and size are unused.
  */
 static void filter16_sobel(uint8_t *dstp, int width,
                            float scale, float delta, const int *const matrix,
                            const uint8_t *c[], int peak, int radius,
                            int dstride, int stride, int size)
 {
    uint16_t *const out = (uint16_t *)dstp;

    for (int i = 0; i < width; i++) {
        const int off = 2 * i;
        const int tl = AV_RN16A(&c[0][off]);
        const int tm = AV_RN16A(&c[1][off]);
        const int tr = AV_RN16A(&c[2][off]);
        const int ml = AV_RN16A(&c[3][off]);
        const int mr = AV_RN16A(&c[5][off]);
        const int bl = AV_RN16A(&c[6][off]);
        const int bm = AV_RN16A(&c[7][off]);
        const int br = AV_RN16A(&c[8][off]);
        /* Row below minus row above. */
        const float suma = (bl + 2 * bm + br) - (tl + 2 * tm + tr);
        /* Right column minus left column. */
        const float sumb = (tr + 2 * mr + br) - (tl + 2 * ml + bl);

        out[i] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, peak);
    }
 }
 /**
  * Install the Sobel callbacks and per-plane parameters in the context.
  *
  * All four plane slots receive the 3x3 setup routine, kernel size 3, and
  * the context's scale/delta as rdiv/bias. A plane is marked "copy" when its
  * bit is not set in s->planes. The C kernel is chosen from the depth
  * argument (16-bit variant for depth > 8); on x86-64 the same depth is then
  * handed to ff_sobel_init_x86(), which may override the kernel.
  *
  * @param s         context to initialise; s->planes, s->scale and s->delta
  *                  must already be set
  * @param depth     bits per sample
  * @param nb_planes number of planes passed on to the x86 initialiser
  */
 static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int nb_planes)
 {
    for (int i = 0; i < 4; i++) {
        /* Use the depth argument here so that the C selection and the
         * x86 override below are always based on the same value. */
        s->filter[i] = depth > 8 ? filter16_sobel : filter_sobel;
        s->copy[i] = !((1 << i) & s->planes);
        s->size[i] = 3;
        s->setup[i] = setup_3x3;
        s->rdiv[i] = s->scale;
        s->bias[i] = s->delta;
    }
 #if ARCH_X86_64
    ff_sobel_init_x86(s, depth, nb_planes);
 #endif
 }
 #endif
--- a/libavfilter/vf_convolution.c
+++ b/libavfilter/vf_convolution.c
@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int width,
    }
 }
 /* Sobel kernel for 16-bit samples: reads the eight outer taps of the 3x3
  * window with AV_RN16A() (c[4] is not used), combines the two gradient
  * sums as sqrt(a*a + b*b) * scale + delta and clips the result to 0..peak. */
 static void filter16_sobel(uint8_t *dstp, int width,
                            float scale, float delta, const int *const matrix,
                            const uint8_t *c[], int peak, int radius,
                            int dstride, int stride, int size)
 {
    uint16_t *dst = (uint16_t *)dstp;
    int x;
    for (x = 0; x < width; x++) {
        float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * -2 + AV_RN16A(&c[2][2 * x]) * -1 +
                     AV_RN16A(&c[6][2 * x]) *  1 + AV_RN16A(&c[7][2 * x]) *  2 + AV_RN16A(&c[8][2 * x]) *  1;
        float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) *  1 + AV_RN16A(&c[3][2 * x]) * -2 +
                     AV_RN16A(&c[5][2 * x]) *  2 + AV_RN16A(&c[6][2 * x]) * -1 + AV_RN16A(&c[8][2 * x]) *  1;
        dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, peak);
    }
 }
 static void filter16_scharr(uint8_t *dstp, int width,
                            float scale, float delta, const int *const matrix,
                            const uint8_t *c[], int peak, int radius,
@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width,
    }
 }
 /* Sobel kernel for 8-bit samples: reads the eight outer taps of the 3x3
  * window (c[4] is not used), combines the two gradient sums as
  * sqrt(a*a + b*b) * scale + delta and clips the result to 0..255. */
 static void filter_sobel(uint8_t *dst, int width,
                          float scale, float delta, const int *const matrix,
                          const uint8_t *c[], int peak, int radius,
                          int dstride, int stride, int size)
 {
    const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2];
    const uint8_t *c3 = c[3], *c5 = c[5];
    const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8];
    int x;
    for (x = 0; x < width; x++) {
        float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 +
                     c6[x] *  1 + c7[x] *  2 + c8[x] *  1;
        float sumb = c0[x] * -1 + c2[x] *  1 + c3[x] * -2 +
                     c5[x] *  2 + c6[x] * -1 + c8[x] *  1;
        dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta);
    }
 }
 static void filter_scharr(uint8_t *dst, int width,
                          float scale, float delta, const int *const matrix,
                          const uint8_t *c[], int peak, int radius,
@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int height,
    }
 }
 /* Fill c[0..8] with pointers to the 3x3 neighbourhood of (x, y) in row-major
  * order. Out-of-range coordinates are mirrored back into the plane: negative
  * ones through FFABS(), ones >= w (or h) as 2 * size - 1 - coord.
  * radius is unused. */
 static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int stride,
                       int x, int w, int y, int h, int bpc)
 {
    int i;
    for (i = 0; i < 9; i++) {
        /* i % 3 selects the column (-1, 0, +1), i / 3 the row. */
        int xoff = FFABS(x + ((i % 3) - 1));
        int yoff = FFABS(y + (i / 3) - 1);
        xoff = xoff >= w ? 2 * w - 1 - xoff : xoff;
        yoff = yoff >= h ? 2 * h - 1 - yoff : yoff;
        c[i] = src + xoff * bpc + yoff * stride;
    }
 }
 static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src, int stride,
                      int x, int w, int y, int h, int bpc)
 {
@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx)
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
    int p, i;
    s->depth = desc->comp[0].depth;
    s->max = (1 << s->depth) - 1;
    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
    s->planewidth[0] = s->planewidth[3] = inlink->w;
    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
    s->planeheight[0] = s->planeheight[3] = inlink->h;
    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
    s->nb_threads = ff_filter_get_nb_threads(ctx);
    s->bpc = (s->depth + 7) / 8;
    if (!strcmp(ctx->filter->name, "convolution")) {
        for (i = 0; i < 4; i++) {
            int *matrix = (int *)s->matrix[i];
@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx)
            s->bias[i] = s->delta;
        }
    } else if (!strcmp(ctx->filter->name, "sobel")) {
-        for (i = 0; i < 4; i++) {
+        ff_sobel_init(s, s->depth, s->nb_planes);
            s->filter[i] = filter_sobel;
            s->copy[i] = !((1 << i) & s->planes);
            s->size[i] = 3;
            s->setup[i] = setup_3x3;
            s->rdiv[i] = s->scale;
            s->bias[i] = s->delta;
        }
    } else if (!strcmp(ctx->filter->name, "kirsch")) {
        for (i = 0; i < 4; i++) {
            s->filter[i] = filter_kirsch;
@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx)
        }
    }
    s->depth = desc->comp[0].depth;
    s->max = (1 << s->depth) - 1;
    s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
    s->planewidth[0] = s->planewidth[3] = inlink->w;
    s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
    s->planeheight[0] = s->planeheight[3] = inlink->h;
    s->nb_planes = av_pix_fmt_count_planes(inlink->format);
    s->nb_threads = ff_filter_get_nb_threads(ctx);
    s->bpc = (s->depth + 7) / 8;
    if (!strcmp(ctx->filter->name, "convolution")) {
        if (s->depth > 8) {
            for (p = 0; p < s->nb_planes; p++) {
@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx)
        if (s->depth > 8)
            for (p = 0; p < s->nb_planes; p++)
                s->filter[p] = filter16_roberts;
    } else if (!strcmp(ctx->filter->name, "sobel")) {
        if (s->depth > 8)
            for (p = 0; p < s->nb_planes; p++)
                s->filter[p] = filter16_sobel;
    } else if (!strcmp(ctx->filter->name, "kirsch")) {
        if (s->depth > 8)
            for (p = 0; p < s->nb_planes; p++)
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@ -22,6 +22,18 @@
 SECTION_RODATA
 half:   dd 0.5
 ; Dword multipliers used by the scalar tail loop (SOBEL_MUL).
 ; NOTE(review): data_p1 is not referenced by the sobel code visible here.
 data_p1: dd  1
 data_n1: dd -1
 data_p2: dd  2
 data_n2: dd -2
 ALIGN 64
 ; vpermb byte indices: output bytes 4*i..4*i+3 pick byte i from each of the
 ; four 16-byte lanes (offsets 0, 16, 32, 48), i.e. the four lanes are
 ; interleaved so that every dword holds the four taps of one pixel.
 sobel_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35, 51
            db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39, 55
            db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
            db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
 ; Signed byte weights broadcast to every dword for vpdpbusd:
 ; sobel_mulA pairs with the taps (c2, c6, c1, c7),
 ; sobel_mulB pairs with the taps (c2, c6, c5, c3).
 sobel_mulA: db -1,  1, -2,  2
 sobel_mulB: db  1, -1,  2, -2
 SECTION .text
@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c
 INIT_XMM sse4
 FILTER_3X3
 %endif
 ; SOBEL_MUL tap, weight
 ; Scalar helper: load the byte at c<tap>[x] zero-extended, multiply it by
 ; the dword stored at <weight> and add the product to the accumulator rd.
 ; ptrd is used as scratch.
 %macro SOBEL_MUL 2
    movzx ptrd, byte [c%1q + xq]
    imul  ptrd, [%2]
    add   rd, ptrd
 %endmacro
 ; SOBEL_ADD tap
 ; Scalar helper for weight +1: load the byte at c<tap>[x] zero-extended and
 ; add it to the accumulator rd. ptrd is used as scratch.
 %macro SOBEL_ADD 1
    movzx ptrd, byte [c%1q + xq]
    add   rd, ptrd
 %endmacro
 ; void ff_filter_sobel_avx512icl(uint8_t *dst, int width,
 ;                                float scale, float delta, const int *const matrix,
 ;                                const uint8_t *c[], int peak, int radius,
 ;                                int dstride, int stride, int size)
 ;
 ; 8-bit Sobel: for every x in [0, width)
 ;   dst[x] = saturate_u8(trunc(sqrt(Ga*Ga + Gb*Gb) * scale + delta))
 ; where Ga/Gb are the two 3x3 gradient sums built from the taps c[0..8]
 ; (c[4] is loaded but never read). Only dst, width and c are read besides
 ; the two floats; matrix, peak, radius, dstride, stride and size are ignored.
 %macro FILTER_SOBEL 0
 ; scale/delta are float arguments, so the UNIX64 variant names only the
 ; integer/pointer arguments.
 %if UNIX64
 cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
 %else
 cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
 %endif
 %if WIN64
    ; Presumably Win64 passes the 3rd/4th arguments (scale, delta) in
    ; xmm2/xmm3; swap them into xmm0/xmm1 and fetch matrix/ptr from the
    ; stack so both ABIs share the code below -- TODO confirm against x86inc.
    SWAP xmm0, xmm2
    SWAP xmm1, xmm3
    mov  r2q, matrixmp
    mov  r3q, ptrmp
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
 %endif
    movsxdifnidn widthq, widthd
    ; m0 = scale, m1 = delta in every lane.
    VBROADCASTSS m0, xmm0
    VBROADCASTSS m1, xmm1
    ; NOTE(review): m6 is zeroed here but is overwritten with sobel_perm
    ; before .loop1 and is not read in .loop2, so this looks redundant.
    pxor  m6, m6
    ; Load the nine tap pointers; after this ptrq is free to be used as scratch.
    mov   c0q, [ptrq + 0*gprsize]
    mov   c1q, [ptrq + 1*gprsize]
    mov   c2q, [ptrq + 2*gprsize]
    mov   c3q, [ptrq + 3*gprsize]
    mov   c4q, [ptrq + 4*gprsize]
    mov   c5q, [ptrq + 5*gprsize]
    mov   c6q, [ptrq + 6*gprsize]
    mov   c7q, [ptrq + 7*gprsize]
    mov   c8q, [ptrq + 8*gprsize]
    xor   xq, xq
    ; Fewer than mmsize/4 pixels: everything is handled by the scalar loop.
    cmp   widthq, mmsize/4
    jl .loop2
    ; rq = width % (mmsize/4); widthq is rounded down to a whole number of
    ; vector iterations for .loop1 and restored afterwards.
    mov   rq, widthq
    and   rq, mmsize/4-1
    sub   widthq, rq
    mova  m6, [sobel_perm]
 ; Vector loop: mmsize/4 pixels per iteration.
 .loop1:
    ; Gather 16 bytes of each tap into 128-bit lanes:
    ;   m2 = [c2 | c6 | c1 | c7],  m3 = [c2 | c6 | c5 | c3]
    ; and start both accumulators with the shared term c8 - c0 (as dwords).
    movu          xm3, [c2q + xq]
    pmovzxbd      m5, [c0q + xq]
    vinserti32x4  ym3, [c6q + xq], 1
    pmovzxbd      m4, [c8q + xq]
    vinserti32x4  m2, m3, [c1q + xq], 2
    vinserti32x4  m3, [c5q + xq], 2
    vinserti32x4  m2, [c7q + xq], 3
    vinserti32x4  m3, [c3q + xq], 3
    ; Interleave the lanes so each dword holds the four taps of one pixel.
    vpermb        m2, m6, m2
    psubd         m4, m5
    vpermb        m3, m6, m3
    mova          m5, m4
    ; Add the weighted taps: m4 += -c2 + c6 - 2*c1 + 2*c7,
    ;                        m5 +=  c2 - c6 + 2*c5 - 2*c3.
    vpdpbusd      m4, m2, [sobel_mulA] {1to16}
    vpdpbusd      m5, m3, [sobel_mulB] {1to16}
    ; m4 = sqrt(m4*m4 + m5*m5) * scale + delta, computed in float.
    cvtdq2ps  m4, m4
    mulps     m4, m4
    cvtdq2ps    m5, m5
    VFMADD231PS m4, m5, m5
    sqrtps    m4, m4
    fmaddps m4, m4, m0, m1
    ; Truncate to integers and store as unsigned-saturated bytes.
    cvttps2dq m4, m4
    vpmovusdb [dstq + xq], m4
    add xq, mmsize/4
    cmp xq, widthq
    jl .loop1
    ; Restore the full width; finish if there is no remainder.
    add widthq, rq
    cmp xq, widthq
    jge .end
 ; Scalar loop: one pixel per iteration (tail, or whole row for short widths).
 .loop2:
    xor  rd, rd
    pxor m4, m4
    ;Gx
    SOBEL_MUL 0, data_n1
    SOBEL_MUL 1, data_n2
    SOBEL_MUL 2, data_n1
    SOBEL_ADD 6
    SOBEL_MUL 7, data_p2
    SOBEL_ADD 8
    ; xmm4 = Gx * Gx
    cvtsi2ss xmm4, rd
    mulss    xmm4, xmm4
    xor rd, rd
    ;Gy
    SOBEL_MUL 0, data_n1
    SOBEL_ADD 2
    SOBEL_MUL 3, data_n2
    SOBEL_MUL 5, data_p2
    SOBEL_MUL 6, data_n1
    SOBEL_ADD 8
    ; xmm4 = sqrt(Gy * Gy + Gx * Gx)
    cvtsi2ss  xmm5, rd
    fmaddss xmm4, xmm5, xmm5, xmm4
    sqrtps    xmm4, xmm4
    fmaddss   xmm4, xmm4, xmm0, xmm1     ;sum = sum * rdiv + bias
    cvttps2dq xmm4, xmm4     ; trunc to integer
    ; Saturate dword -> word -> unsigned byte and store the low byte.
    packssdw  xmm4, xmm4
    packuswb  xmm4, xmm4
    movd      rd, xmm4
    mov       [dstq + xq], rb
    add xq, 1
    cmp xq, widthq
    jl .loop2
 .end:
    RET
 %endmacro
 ; Instantiate the kernel for x86-64 builds whose assembler supports the
 ; avx512icl instruction set.
 %if ARCH_X86_64
 %if HAVE_AVX512ICL_EXTERNAL
 INIT_ZMM avx512icl
 FILTER_SOBEL
 %endif
 %endif
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width,
                        const uint8_t *c[], int peak, int radius,
                        int dstride, int stride, int size);
 /* 8-bit Sobel kernel implemented in x86/vf_convolution.asm (FILTER_SOBEL,
  * built with INIT_ZMM avx512icl); same signature as the C filter callbacks. */
 void ff_filter_sobel_avx512icl(uint8_t *dst, int width,
                         float scale, float delta, const int *const matrix,
                         const uint8_t *c[], int peak, int radius,
                         int dstride, int stride, int size);
 av_cold void ff_convolution_init_x86(ConvolutionContext *s)
 {
 #if ARCH_X86_64
@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
    }
 #endif
 }
 /**
  * Replace the Sobel kernel of the first nb_planes planes with the
  * AVX-512 (icl) assembly version when the samples are 8 bits deep and the
  * CPU flags report support. Does nothing on builds that are not x86-64.
  */
 av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes)
 {
 #if ARCH_X86_64
    const int cpu_flags = av_get_cpu_flags();

    /* Only an 8-bit assembly kernel exists. */
    if (depth != 8)
        return;

    if (EXTERNAL_AVX512ICL(cpu_flags)) {
        for (int plane = 0; plane < nb_planes; plane++)
            s->filter[plane] = ff_filter_sobel_avx512icl;
    }
 #endif
 }
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
 AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
 AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)    += vf_nlmeans.o
 AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@ -197,6 +197,9 @@ static const struct {
    #if CONFIG_THRESHOLD_FILTER
        { "vf_threshold", checkasm_check_vf_threshold },
    #endif
    #if CONFIG_SOBEL_FILTER
        { "vf_sobel", checkasm_check_vf_sobel },
    #endif
 #endif
 #if CONFIG_SWSCALE
    { "sw_gbrp", checkasm_check_sw_gbrp },
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
 void checkasm_check_vf_sobel(void);
 void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
--- a/tests/checkasm/vf_convolution.c
+++ b/tests/checkasm/vf_convolution.c
@ -0,0 +1,104 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
 #include <string.h>
 #include "checkasm.h"
 #include "libavfilter/avfilter.h"
 #include "libavfilter/convolution.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 #define WIDTH 512
 #define HEIGHT 512
 #define SRC_STRIDE 512
 #define PIXELS (WIDTH * HEIGHT)
 /* Fill the first `size` bytes of `buf` with pseudo-random values from rnd().
  * Both arguments are parenthesised so that expressions can be passed safely. */
 #define randomize_buffers(buf, size)          \
    do {                                      \
        uint8_t *tmp_buf = (uint8_t *)(buf);  \
        for (int j = 0; j < (size); j++)      \
            tmp_buf[j] = rnd() & 0xFF;        \
    } while (0)
 /**
  * Compare the selected 8-bit Sobel kernel against the C reference.
  *
  * A WIDTH x HEIGHT plane of random bytes is filtered row by row. For every
  * row the 3x3 tap pointers are set up at x = radius, the reference and the
  * tested implementation each write width - 2 * radius pixels into their own
  * zero-initialised output plane, and the complete output rows are compared,
  * so a write outside the requested pixels within the row is detected too.
  *
  * @param report_name name under which the function is reported by checkasm
  */
 static void check_sobel(const char * report_name)
 {
    LOCAL_ALIGNED_32(uint8_t, src,     [PIXELS]);
    LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]);
    LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]);
    const int width   = WIDTH;
    const int height  = HEIGHT;
    const int stride  = SRC_STRIDE;
    const int dstride = SRC_STRIDE;
    const int radius  = 1;
    const int bpc     = 1;
    /* Pixels produced per row: the first and last `radius` columns are skipped. */
    const int out_w   = width - 2 * radius;
    /* Last callback argument; none of the Sobel kernels reads it. */
    const int size    = height - 1;
    const float scale = 2;
    const float delta = 10;
    /* The 3x3 setup routine fills nine tap pointers. */
    const uint8_t *c[9];
    ConvolutionContext s = { 0 };

    declare_func(void, uint8_t *dst, int width, float scale, float delta, const int *const matrix,
                 const uint8_t *c[], int peak, int radius, int dstride, int stride, int size);

    s.scale = scale;
    s.delta = delta;
    s.depth = 8;
    s.nb_planes = 3;
    s.planes = 15;
    ff_sobel_init(&s, s.depth, s.nb_planes);

    memset(dst_ref, 0, PIXELS);
    memset(dst_new, 0, PIXELS);
    randomize_buffers(src, PIXELS);

    if (check_func(s.filter[0], "%s", report_name)) {
        for (int y = 0; y < height; y++) {
            /* Both outputs use the same row so they are compared like for like. */
            uint8_t *ref_row = dst_ref + y * dstride;
            uint8_t *new_row = dst_new + y * dstride;

            s.setup[0](radius, c, src, stride, radius, width, y, height, bpc);
            call_ref(ref_row + radius * bpc, out_w,
                     scale, delta, NULL, c, 0, radius,
                     dstride, stride, size);
            call_new(new_row + radius * bpc, out_w,
                     scale, delta, NULL, c, 0, radius,
                     dstride, stride, size);
            if (memcmp(ref_row, new_row, dstride))
                fail();
        }
        /* Benchmark once, reusing the tap pointers of the last row. */
        bench_new(dst_new + radius * bpc, out_w,
                  scale, delta, NULL, c, 0, radius,
                  dstride, stride, size);
    }
 }
 /* checkasm entry point: run the Sobel check, then emit the report line for
  * the "convolution:sobel" group. */
 void checkasm_check_vf_sobel(void)
 {
    check_sobel("sobel");

    report("convolution:sobel");
 }
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@ -43,6 +43,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
                fate-checkasm-vf_hflip                                  \
                fate-checkasm-vf_nlmeans                                \
                fate-checkasm-vf_threshold                              \
                fate-checkasm-vf_sobel                                  \
                fate-checkasm-videodsp                                  \
                fate-checkasm-vorbisdsp                                 \
                fate-checkasm-vp8dsp                                    \