aarch64/vvc: Add put_qpel_hx i8mm

Benchmark on an Android Pixel 8 with -fno-vectorize

put_luma_h_8_4x4_c:                                      0.2 ( 1.00x)
put_luma_h_8_4x4_neon:                                   0.2 ( 1.00x)
put_luma_h_8_4x4_i8mm:                                   0.0 ( 0.00x)
put_luma_h_8_8x8_c:                                      1.5 ( 1.00x)
put_luma_h_8_8x8_neon:                                   0.5 ( 3.00x)
put_luma_h_8_8x8_i8mm:                                   0.5 ( 3.00x)
put_luma_h_8_16x16_c:                                    6.2 ( 1.00x)
put_luma_h_8_16x16_neon:                                 2.0 ( 3.12x)
put_luma_h_8_16x16_i8mm:                                 1.5 ( 4.17x)
put_luma_h_8_32x32_c:                                   25.5 ( 1.00x)
put_luma_h_8_32x32_neon:                                 9.0 ( 2.83x)
put_luma_h_8_32x32_i8mm:                                 6.8 ( 3.78x)
put_luma_h_8_64x64_c:                                   99.8 ( 1.00x)
put_luma_h_8_64x64_neon:                                35.2 ( 2.83x)
put_luma_h_8_64x64_i8mm:                                27.2 ( 3.66x)
put_luma_h_8_128x128_c:                                422.0 ( 1.00x)
put_luma_h_8_128x128_neon:                             138.5 ( 3.05x)
put_luma_h_8_128x128_i8mm:                             109.2 ( 3.86x)
release/7.1
Zhao Zhili 4 months ago committed by Nuo Mi
parent 25448d1716
commit 9f6c8eb412
  1. 4
      libavcodec/aarch64/h26x/dsp.h
  2. 68
      libavcodec/aarch64/h26x/qpel_neon.S
  3. 9
      libavcodec/aarch64/vvc/dsp_init.c

@ -270,4 +270,8 @@ NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
int height, int denom, int wx, int ox,
const int8_t *hf, const int8_t *vf, int width),);
/* Prototype declarations for the "qpel_h" function family with the _i8mm
 * suffix, all sharing the argument list given here.
 * NOTE(review): the macro definition is not visible in this view; it
 * presumably expands to one prototype per supported block width
 * (ff_vvc_put_qpel_h{4,8,16,32,64,128}_8_neon_i8mm) -- confirm. */
NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t * dst,
const uint8_t *_src, ptrdiff_t _srcstride, int height,
const int8_t *hf, const int8_t *vf, int width), _i8mm);
#endif

@ -3516,6 +3516,17 @@ endfunc
sub x1, x1, #3
.endm
// Common prologue used by the ff_vvc_put_qpel_h*_8_neon_i8mm entry points.
// NOTE(review): with the prototype (dst, _src, _srcstride, height, hf, vf,
// width) and AAPCS64 argument passing, x1 should be _src and x4 should be
// hf -- confirm against the declaration.
.macro VVC_QPEL_H_HEADER
// Load 8 bytes from [x4] and replicate them into both 64-bit lanes of v31
// (presumably the 8 int8 filter taps -- confirm).
ld1r {v31.2d}, [x4]
// Step x1 back by 3 bytes (presumably so the filter window starts 3 samples
// to the left of the current position -- confirm).
sub x1, x1, #3
.endm
// VVC entry point for the h4 horizontal qpel put (i8mm build).
// It only performs VVC-specific setup and then branches forward to the next
// local label `1:`, so the processing loop itself is not duplicated here.
// NOTE(review): that `1:` label is expected to be inside
// ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, which must directly follow this
// function; the label is outside the visible hunk -- confirm.
function ff_vvc_put_qpel_h4_8_neon_i8mm, export=1
VVC_QPEL_H_HEADER
// x10 = VVC_MAX_PB_SIZE * 2; presumably the destination row stride in bytes
// (int16_t samples) consumed by the shared loop -- confirm.
mov x10, #VVC_MAX_PB_SIZE * 2
// Jump to the nearest following `1:` label; nothing after this branch in
// this function is executed.
b 1f
endfunc
function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
QPEL_H_HEADER
mov x10, #HEVC_MAX_PB_SIZE * 2
@ -3572,6 +3583,12 @@ function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
ret
endfunc
// VVC entry point for the h8 horizontal qpel put (i8mm build).
// It only performs VVC-specific setup and then branches forward to the next
// local label `1:`, so the processing loop itself is not duplicated here.
// NOTE(review): that `1:` label is expected to be inside
// ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, which must directly follow this
// function; the label is outside the visible hunk -- confirm.
function ff_vvc_put_qpel_h8_8_neon_i8mm, export=1
VVC_QPEL_H_HEADER
// x10 = VVC_MAX_PB_SIZE * 2; presumably the destination row stride in bytes
// (int16_t samples) consumed by the shared loop -- confirm.
mov x10, #VVC_MAX_PB_SIZE * 2
// Jump to the nearest following `1:` label; nothing after this branch in
// this function is executed.
b 1f
endfunc
function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
QPEL_H_HEADER
mov x10, #HEVC_MAX_PB_SIZE * 2
@ -3656,6 +3673,12 @@ function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
ret
endfunc
// VVC entry point for the h16 horizontal qpel put (i8mm build).
// It only performs VVC-specific setup and then branches forward to the next
// local label `1:`, so the processing loop itself is not duplicated here.
// NOTE(review): that `1:` label is expected to be inside
// ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, which must directly follow this
// function; the label is outside the visible hunk -- confirm.
function ff_vvc_put_qpel_h16_8_neon_i8mm, export=1
VVC_QPEL_H_HEADER
// x10 = VVC_MAX_PB_SIZE * 2; presumably the destination row stride in bytes
// (int16_t samples) consumed by the shared loop -- confirm.
mov x10, #VVC_MAX_PB_SIZE * 2
// Jump to the nearest following `1:` label; nothing after this branch in
// this function is executed.
b 1f
endfunc
function ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, export=1
QPEL_H_HEADER
mov x10, #HEVC_MAX_PB_SIZE * 2
@ -3746,6 +3769,13 @@ function ff_hevc_put_hevc_qpel_h24_8_neon_i8mm, export=1
ret
endfunc
// VVC entry point for the h32 horizontal qpel put (i8mm build).
// It only performs VVC-specific setup and then branches forward to the next
// local label `1:`, so the processing loop itself is not duplicated here.
// NOTE(review): that `1:` label is expected to be inside
// ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, which must directly follow this
// function; the label is outside the visible hunk -- confirm.
function ff_vvc_put_qpel_h32_8_neon_i8mm, export=1
VVC_QPEL_H_HEADER
// x10 = VVC_MAX_PB_SIZE * 2; presumably the destination row stride in bytes
// (int16_t samples) consumed by the shared loop -- confirm.
mov x10, #VVC_MAX_PB_SIZE * 2
// x15 = x0 + 32, i.e. a second pointer 32 bytes past x0. Presumably the
// shared loop uses it to store the second half of each output row -- confirm.
add x15, x0, #32
// Jump to the nearest following `1:` label; nothing after this branch in
// this function is executed.
b 1f
endfunc
function ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, export=1
QPEL_H_HEADER
mov x10, #HEVC_MAX_PB_SIZE * 2
@ -3881,10 +3911,7 @@ function ff_hevc_put_hevc_qpel_h48_8_neon_i8mm, export=1
ret
endfunc
function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
QPEL_H_HEADER
sub x2, x2, #64
1:
.macro put_qpel_h64_8_neon_i8mm
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
ext v1.16b, v16.16b, v17.16b, #1
ext v2.16b, v16.16b, v17.16b, #2
@ -3975,11 +4002,42 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
sqxtn2 v20.8h, v26.4s
sqxtn v21.4h, v23.4s
sqxtn2 v21.8h, v27.4s
stp q20, q21, [x0], #32
stp q20, q21, [x0]
add x0, x0, x10
.endm
// VVC entry point for the h64 horizontal qpel put (i8mm build).
// Performs VVC-specific setup and then branches to the `1:` loop in
// ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, which directly follows and expands
// put_qpel_h64_8_neon_i8mm once per iteration.
function ff_vvc_put_qpel_h64_8_neon_i8mm, export=1
VVC_QPEL_H_HEADER
// x10 is what the macro adds to x0 after its final store.
// NOTE(review): the "- 32 * 3" presumably compensates for three earlier
// 32-byte post-incremented stores inside the macro, so x0 advances by
// VVC_MAX_PB_SIZE * 2 bytes per row in total -- confirm against the full
// macro body.
mov x10, #(VVC_MAX_PB_SIZE * 2 - 32 * 3)
// The macro's first load post-increments x1 by 64, so 64 is taken off x2
// here. NOTE(review): presumably x2 (source stride) is added to x1 later in
// the macro to reach the next row -- confirm.
sub x2, x2, #64
// Continue in the shared loop of the following function.
b 1f
endfunc
// HEVC h64 horizontal qpel put (i8mm build).
// The loop at `1:` is also the branch target of
// ff_vvc_put_qpel_h64_8_neon_i8mm, which enters with its own x10 and the
// same x2 adjustment.
function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
QPEL_H_HEADER
// x10 is what the macro adds to x0 after its final store; 32 steps past the
// 32 bytes written by that last `stp q20, q21, [x0]`.
mov x10, #32
// The macro's first load post-increments x1 by 64, so 64 is taken off x2
// here. NOTE(review): presumably x2 (source stride) is added to x1 later in
// the macro to reach the next row -- confirm.
sub x2, x2, #64
1:
// One expansion of the macro per loop iteration.
put_qpel_h64_8_neon_i8mm
// w3 is the loop counter (height, per the prototype's 4th argument under
// AAPCS64); loop until it reaches zero.
subs w3, w3, #1
b.ne 1b
ret
endfunc
// VVC h128 horizontal qpel put (i8mm build).
// Each loop iteration expands put_qpel_h64_8_neon_i8mm twice, then adds x11
// to x1.
function ff_vvc_put_qpel_h128_8_neon_i8mm, export=1
VVC_QPEL_H_HEADER
// x11 = x2 - 128: source stride minus 128, added to x1 once per iteration
// after both macro expansions.
sub x11, x2, #128
// x10 is what the macro adds to x0 after its final store; 32 steps past the
// 32 bytes written by that last `stp`, so the two expansions write
// back-to-back.
mov x10, #32
// x2 is zeroed. NOTE(review): presumably this turns the macro's own
// per-row source advance into a no-op, so the second expansion continues
// right after the first -- confirm against the full macro body.
mov x2, #0
1:
put_qpel_h64_8_neon_i8mm
// Decrement the loop counter w3 here; the resulting flags are consumed by
// the `b.ne` below. This relies on the macro expansion and the plain `add`
// (non-flag-setting form) leaving the condition flags untouched.
// NOTE(review): only part of the macro is visible; confirm it contains no
// flag-setting instruction.
subs w3, w3, #1
put_qpel_h64_8_neon_i8mm
// Advance x1 by the remaining stride.
add x1, x1, x11
b.ne 1b
ret
endfunc
DISABLE_I8MM
#endif

@ -88,6 +88,15 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
c->alf.filter[LUMA] = alf_filter_luma_8_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_8_neon;
if (have_i8mm(cpu_flags)) {
c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon_i8mm;
c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon_i8mm;
c->inter.put[0][4][0][1] = ff_vvc_put_qpel_h32_8_neon_i8mm;
c->inter.put[0][5][0][1] = ff_vvc_put_qpel_h64_8_neon_i8mm;
c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h128_8_neon_i8mm;
}
} else if (bd == 10) {
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;

Loading…
Cancel
Save