lavc/aarch64: new optimization for 8-bit hevc_epel_h

put_hevc_epel_h4_8_c: 67.1
put_hevc_epel_h4_8_i8mm: 21.1
put_hevc_epel_h6_8_c: 147.1
put_hevc_epel_h6_8_i8mm: 45.1
put_hevc_epel_h8_8_c: 237.4
put_hevc_epel_h8_8_i8mm: 72.1
put_hevc_epel_h12_8_c: 527.4
put_hevc_epel_h12_8_i8mm: 115.4
put_hevc_epel_h16_8_c: 943.6
put_hevc_epel_h16_8_i8mm: 153.9
put_hevc_epel_h24_8_c: 2105.4
put_hevc_epel_h24_8_i8mm: 384.4
put_hevc_epel_h32_8_c: 3631.4
put_hevc_epel_h32_8_i8mm: 519.9
put_hevc_epel_h48_8_c: 8082.1
put_hevc_epel_h48_8_i8mm: 1110.4
put_hevc_epel_h64_8_c: 14400.6
put_hevc_epel_h64_8_i8mm: 2057.1

put_hevc_qpel_h4_8_c: 124.9
put_hevc_qpel_h4_8_neon: 43.1
put_hevc_qpel_h4_8_i8mm: 33.1
put_hevc_qpel_h6_8_c: 269.4
put_hevc_qpel_h6_8_neon: 90.6
put_hevc_qpel_h6_8_i8mm: 61.4
put_hevc_qpel_h8_8_c: 477.6
put_hevc_qpel_h8_8_neon: 82.1
put_hevc_qpel_h8_8_i8mm: 99.9
put_hevc_qpel_h12_8_c: 1062.4
put_hevc_qpel_h12_8_neon: 226.9
put_hevc_qpel_h12_8_i8mm: 170.9
put_hevc_qpel_h16_8_c: 1880.6
put_hevc_qpel_h16_8_neon: 302.9
put_hevc_qpel_h16_8_i8mm: 251.4
put_hevc_qpel_h24_8_c: 4221.9
put_hevc_qpel_h24_8_neon: 893.9
put_hevc_qpel_h24_8_i8mm: 626.1
put_hevc_qpel_h32_8_c: 7437.6
put_hevc_qpel_h32_8_neon: 1189.9
put_hevc_qpel_h32_8_i8mm: 959.1
put_hevc_qpel_h48_8_c: 16838.4
put_hevc_qpel_h48_8_neon: 2727.9
put_hevc_qpel_h48_8_i8mm: 2163.9
put_hevc_qpel_h64_8_c: 29982.1
put_hevc_qpel_h64_8_neon: 4777.6

Signed-off-by: Martin Storsjö <martin@martin.st>
pull/389/head
Logan Lyu 2 years ago committed by Martin Storsjö
parent 668eb4c00e
commit d48c89701c
  1. 343
      libavcodec/aarch64/hevcdsp_epel_neon.S
  2. 5
      libavcodec/aarch64/hevcdsp_init_aarch64.c

@ -33,6 +33,349 @@ const epel_filters, align=4
endconst
#if HAVE_I8MM
// Common prologue for the ff_hevc_put_hevc_epel_h*_8_neon_i8mm functions.
//   in : x1 = source pointer, x4 = index into epel_filters (4 bytes per entry)
//   out: x1  = source pointer moved back by one byte
//        v30 = the selected 4-byte filter entry replicated into all four 32-bit lanes
//        x10 = MAX_PB_SIZE * 2, the byte step the users of this macro apply to x0 per row
//   clobbers: x5
.macro EPEL_H_HEADER
        movrel          x5, epel_filters
        sub             x1, x1, #1                  // first tap sits one byte left of the output position
        add             x5, x5, x4, lsl #2          // x5 = &epel_filters[x4 * 4]
        mov             x10, #(MAX_PB_SIZE * 2)
        ld1r            {v30.4s}, [x5]              // broadcast the 4 taps to every 32-bit lane
.endm
// Horizontal 4-tap filter, 4 results per row.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane.
function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.8b}, [x1], x2           // 8 source bytes, then step to the next row
        movi            v20.2d, #0                  // usdot accumulates, so start from zero
        ext             v1.8b, v0.8b, v0.8b, #1     // source shifted by 1 byte
        ext             v2.8b, v0.8b, v0.8b, #2     // source shifted by 2 bytes
        ext             v3.8b, v0.8b, v0.8b, #3     // source shifted by 3 bytes
        // Gather the first 4 bytes of each shifted copy: v0 = { win0, win1, win2, win3 }
        trn1            v0.2s, v0.2s, v1.2s
        trn1            v2.2s, v2.2s, v3.2s
        trn1            v0.2d, v0.2d, v2.2d
        usdot           v20.4s, v0.16b, v30.16b     // one 4-tap dot product per 32-bit lane
        xtn             v20.4h, v20.4s              // keep the low 16 bits of each result
        subs            w3, w3, #1                  // height
        st1             {v20.4h}, [x0], x10
        b.ne            1b
        ret
endfunc
// Horizontal 4-tap filter, 6 results per row: results 0..3 are produced by a
// 128-bit usdot, results 4..5 by a 64-bit usdot.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane.
function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v4.16b}, [x1], x2
        movi            v18.2d, #0                  // accumulator for results 0..3
        movi            v19.2d, #0                  // accumulator for results 4..5
        ext             v5.16b, v4.16b, v4.16b, #1  // source shifted by 1 byte
        ext             v6.8b, v4.8b, v4.8b, #2     // shifted by 2 (only its first 4 bytes are used)
        ext             v7.8b, v4.8b, v4.8b, #3     // shifted by 3 (only its first 4 bytes are used)
        trn2            v17.2s, v4.2s, v5.2s        // v17 = { win4, win5 }
        trn1            v16.2s, v4.2s, v5.2s        // v16 = { win0, win1 }
        trn1            v6.2s, v6.2s, v7.2s         // v6  = { win2, win3 }
        usdot           v19.2s, v17.8b, v30.8b
        trn1            v16.2d, v16.2d, v6.2d       // v16 = { win0, win1, win2, win3 }
        usdot           v18.4s, v16.16b, v30.16b
        xtn             v19.4h, v19.4s
        xtn             v18.4h, v18.4s
        subs            w3, w3, #1                  // height
        str             s19, [x0, #8]               // results 4..5
        str             d18, [x0]                   // results 0..3
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc
// Horizontal 4-tap filter, 8 results per row. Even-numbered and odd-numbered
// results are computed in separate registers and interleaved by st2 on store.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane.
function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b}, [x1], x2
        movi            v16.2d, #0
        movi            v17.2d, #0
        ext             v2.16b, v0.16b, v0.16b, #2  // source shifted by 2 bytes
        ext             v1.16b, v0.16b, v0.16b, #1  // source shifted by 1 byte
        ext             v3.16b, v0.16b, v0.16b, #3  // source shifted by 3 bytes
        zip1            v4.4s, v0.4s, v2.4s         // windows for results 0, 2, 4, 6
        zip1            v5.4s, v1.4s, v3.4s         // windows for results 1, 3, 5, 7
        usdot           v16.4s, v4.16b, v30.16b
        usdot           v17.4s, v5.16b, v30.16b
        subs            w3, w3, #1                  // height
        xtn             v16.4h, v16.4s
        xtn             v17.4h, v17.4s
        st2             {v16.4h, v17.4h}, [x0], x10 // interleave even/odd back into 0..7
        b.ne            1b
        ret
endfunc
// Horizontal 4-tap filter, 12 results per row. The 4-byte windows are transposed
// so that each usdot yields four consecutive results.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane.
function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v4.16b}, [x1], x2
        movi            v16.2d, #0
        movi            v17.2d, #0
        movi            v18.2d, #0
        ext             v5.16b, v4.16b, v4.16b, #1  // source shifted by 1 byte
        ext             v6.16b, v4.16b, v4.16b, #2  // source shifted by 2 bytes
        ext             v7.16b, v4.16b, v4.16b, #3  // source shifted by 3 bytes
        // Pair up 64-bit halves of the shifted copies ...
        trn1            v20.2d, v4.2d, v6.2d
        trn1            v21.2d, v5.2d, v7.2d
        trn2            v22.2d, v4.2d, v6.2d
        trn2            v23.2d, v5.2d, v7.2d
        // ... then pick 32-bit windows so each register holds four consecutive positions.
        trn1            v4.4s, v20.4s, v21.4s       // windows 0..3
        trn2            v5.4s, v20.4s, v21.4s       // windows 4..7
        trn1            v6.4s, v22.4s, v23.4s       // windows 8..11
        usdot           v16.4s, v4.16b, v30.16b
        usdot           v17.4s, v5.16b, v30.16b
        usdot           v18.4s, v6.16b, v30.16b
        subs            w3, w3, #1                  // height
        xtn             v18.4h, v18.4s              // results 8..11
        xtn             v16.4h, v16.4s              // results 0..3
        xtn2            v16.8h, v17.4s              // results 4..7
        str             d18, [x0, #16]
        str             q16, [x0]
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc
// Horizontal 4-tap filter, 16 results per row. Even and odd results are kept in
// separate registers and interleaved by st2 on store.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane.
function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b, v1.16b}, [x1], x2
        movi            v16.2d, #0
        movi            v17.2d, #0
        movi            v18.2d, #0
        movi            v19.2d, #0
        ext             v5.16b, v0.16b, v1.16b, #1  // source shifted by 1 byte
        ext             v6.16b, v0.16b, v1.16b, #2  // source shifted by 2 bytes
        ext             v7.16b, v0.16b, v1.16b, #3  // source shifted by 3 bytes
        zip1            v20.4s, v0.4s, v6.4s        // windows 0, 2, 4, 6
        zip1            v21.4s, v5.4s, v7.4s        // windows 1, 3, 5, 7
        zip2            v22.4s, v0.4s, v6.4s        // windows 8, 10, 12, 14
        zip2            v23.4s, v5.4s, v7.4s        // windows 9, 11, 13, 15
        usdot           v16.4s, v20.16b, v30.16b
        usdot           v17.4s, v21.16b, v30.16b
        usdot           v18.4s, v22.16b, v30.16b
        usdot           v19.4s, v23.16b, v30.16b
        subs            w3, w3, #1                  // height
        xtn             v16.4h, v16.4s
        xtn             v17.4h, v17.4s
        xtn2            v16.8h, v18.4s              // v16 = even results 0..14
        xtn2            v17.8h, v19.4s              // v17 = odd results 1..15
        st2             {v16.8h, v17.8h}, [x0], x10
        b.ne            1b
        ret
endfunc
// Horizontal 4-tap filter, 24 results per row.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane. x7 is used as a scratch store address.
function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
EPEL_H_HEADER
// Load 32 source bytes for this row, then step x1 by x2.
1: ld1 {v0.16b, v1.16b}, [x1], x2
subs w3, w3, #1 // height
// v5/v6/v7 = first 16 bytes shifted by 1/2/3, refilled from v1.
ext v5.16b, v0.16b, v1.16b, #1
ext v6.16b, v0.16b, v1.16b, #2
ext v7.16b, v0.16b, v1.16b, #3
// v26/v27/v28 = second 16 bytes shifted by 1/2/3. The top bytes wrap around
// within v1; those bytes only feed results beyond index 23, which are not stored.
ext v26.16b, v1.16b, v1.16b, #1
ext v27.16b, v1.16b, v1.16b, #2
ext v28.16b, v1.16b, v1.16b, #3
// usdot accumulates into its destination, so clear all eight accumulators.
movi v16.2d, #0
movi v17.2d, #0
movi v18.2d, #0
movi v19.2d, #0
movi v20.2d, #0
movi v21.2d, #0
movi v22.2d, #0
movi v23.2d, #0
// Lane i of v16..v19 = result 4*i + {0,1,2,3}.
usdot v16.4s, v0.16b, v30.16b
usdot v17.4s, v5.16b, v30.16b
usdot v18.4s, v6.16b, v30.16b
usdot v19.4s, v7.16b, v30.16b
// Lane i of v20..v23 = result 16 + 4*i + {0,1,2,3}.
usdot v20.4s, v1.16b, v30.16b
usdot v21.4s, v26.16b, v30.16b
usdot v22.4s, v27.16b, v30.16b
usdot v23.4s, v28.16b, v30.16b
// Narrow to 16 bits: v16+k now holds results k, k+4, k+8, ..., k+28.
xtn v16.4h, v16.4s
xtn2 v16.8h, v20.4s
xtn v17.4h, v17.4s
xtn2 v17.8h, v21.4s
xtn v18.4h, v18.4s
xtn2 v18.8h, v22.4s
xtn v19.4h, v19.4s
xtn2 v19.8h, v23.4s
// Re-sort: v20 = even results 0..14, v21 = odd results 1..15,
// v22 = even results 16..30, v23 = odd results 17..31.
zip1 v20.8h, v16.8h, v18.8h
zip1 v21.8h, v17.8h, v19.8h
zip2 v22.8h, v16.8h, v18.8h
zip2 v23.8h, v17.8h, v19.8h
// v22 = results 16..23 in order (low halves of the even/odd registers interleaved).
zip1 v22.8h, v22.8h, v23.8h
// Take the address of result 16 before the st2 below advances x0 to the next row.
add x7, x0, #32
// st2 interleaves even/odd, writing results 0..15, then x0 += x10.
st2 {v20.8h, v21.8h}, [x0], x10
st1 {v22.8h}, [x7]
b.ne 1b
ret
endfunc
// Horizontal 4-tap filter, 32 results per row. Results are grouped by
// (index mod 4) into four registers and put back in order by st4.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane.
function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2
        // Source bytes 0..15: lane i of v16+k = result 4*i + k
        movi            v16.2d, #0
        usdot           v16.4s, v0.16b, v30.16b
        ext             v5.16b, v0.16b, v1.16b, #1
        movi            v17.2d, #0
        usdot           v17.4s, v5.16b, v30.16b
        ext             v6.16b, v0.16b, v1.16b, #2
        movi            v18.2d, #0
        usdot           v18.4s, v6.16b, v30.16b
        ext             v7.16b, v0.16b, v1.16b, #3
        movi            v19.2d, #0
        usdot           v19.4s, v7.16b, v30.16b
        // Source bytes 16..31: lane i of v20+k = result 16 + 4*i + k
        movi            v20.2d, #0
        usdot           v20.4s, v1.16b, v30.16b
        ext             v26.16b, v1.16b, v2.16b, #1
        movi            v21.2d, #0
        usdot           v21.4s, v26.16b, v30.16b
        ext             v27.16b, v1.16b, v2.16b, #2
        movi            v22.2d, #0
        usdot           v22.4s, v27.16b, v30.16b
        ext             v28.16b, v1.16b, v2.16b, #3
        movi            v23.2d, #0
        usdot           v23.4s, v28.16b, v30.16b
        subs            w3, w3, #1                  // height
        // Narrow to 16 bits; v16+k ends up holding results k, k+4, ..., k+28
        xtn             v16.4h, v16.4s
        xtn             v17.4h, v17.4s
        xtn             v18.4h, v18.4s
        xtn             v19.4h, v19.4s
        xtn2            v16.8h, v20.4s
        xtn2            v17.8h, v21.4s
        xtn2            v18.8h, v22.4s
        xtn2            v19.8h, v23.4s
        st4             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10
        b.ne            1b
        ret
endfunc
// Horizontal 4-tap filter, 48 results per row.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, x10 = destination step per row,
// v30 = filter taps in every 32-bit lane. x7 is used as a scratch store address.
//
// Results 0..31 are grouped by (index mod 4) into v20..v23 and written with a
// 128-bit st4; results 32..47 are grouped the same way and written with a
// 64-bit st4 so that they land in ascending order as well.
function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
        EPEL_H_HEADER
1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
        subs            w3, w3, #1   // height
        // Address of result 32 of THIS row. It must be taken before the first
        // st4 post-increments x0 to the next row.
        add             x7, x0, #64
        ext             v4.16b, v0.16b, v1.16b, #1
        ext             v5.16b, v0.16b, v1.16b, #2
        ext             v6.16b, v0.16b, v1.16b, #3
        ext             v16.16b, v1.16b, v2.16b, #1
        ext             v17.16b, v1.16b, v2.16b, #2
        ext             v18.16b, v1.16b, v2.16b, #3
        // Lane i of v20+k = result 4*i + k (source bytes 0..15)
        movi            v20.2d, #0
        movi            v21.2d, #0
        movi            v22.2d, #0
        movi            v23.2d, #0
        usdot           v20.4s, v0.16b, v30.16b
        usdot           v21.4s, v4.16b, v30.16b
        usdot           v22.4s, v5.16b, v30.16b
        usdot           v23.4s, v6.16b, v30.16b
        // Lane i of v24+k = result 16 + 4*i + k (source bytes 16..31)
        movi            v24.2d, #0
        movi            v25.2d, #0
        movi            v26.2d, #0
        movi            v27.2d, #0
        usdot           v24.4s, v1.16b, v30.16b
        usdot           v25.4s, v16.16b, v30.16b
        usdot           v26.4s, v17.16b, v30.16b
        usdot           v27.4s, v18.16b, v30.16b
        // Narrow to 16 bits; v20+k holds results k, k+4, ..., k+28
        xtn             v20.4h, v20.4s
        xtn2            v20.8h, v24.4s
        xtn             v21.4h, v21.4s
        xtn2            v21.8h, v25.4s
        xtn             v22.4h, v22.4s
        xtn2            v22.8h, v26.4s
        xtn             v23.4h, v23.4s
        xtn2            v23.8h, v27.4s
        // Results 0..31 in order, then x0 moves on to the next row.
        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10
        ext             v4.16b, v2.16b, v3.16b, #1
        ext             v5.16b, v2.16b, v3.16b, #2
        ext             v6.16b, v2.16b, v3.16b, #3
        // Lane i of v20+k = result 32 + 4*i + k (source bytes 32..47)
        movi            v20.2d, #0
        movi            v21.2d, #0
        movi            v22.2d, #0
        movi            v23.2d, #0
        usdot           v20.4s, v2.16b, v30.16b
        usdot           v21.4s, v4.16b, v30.16b
        usdot           v22.4s, v5.16b, v30.16b
        usdot           v23.4s, v6.16b, v30.16b
        // Keep the four (index mod 4) groups in separate 64-bit registers and
        // let st4 interleave them. Packing two groups into one register and
        // using st2 would emit 32,33,36,37,... instead of 32,33,34,35,...
        xtn             v20.4h, v20.4s
        xtn             v21.4h, v21.4s
        xtn             v22.4h, v22.4s
        xtn             v23.4h, v23.4s
        st4             {v20.4h, v21.4h, v22.4h, v23.4h}, [x7]
        b.ne            1b
        ret
endfunc
// Horizontal 4-tap filter, 64 results per row, processed as two halves of 32.
// Register use after EPEL_H_HEADER: x0 = destination (16-bit results), x1 = source,
// x2 = source step per row, w3 = rows left, v30 = filter taps in every 32-bit lane.
function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
EPEL_H_HEADER
// The first load of each row advances x1 by 64, so the second load only has to
// add the remainder of the row step.
sub x2, x2, #64
// Load source bytes 0..63 of this row; x1 += 64.
1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
subs w3, w3, #1 // height
// Shifted copies (by 1/2/3 bytes) of source bytes 0..15 and 16..31.
ext v4.16b, v0.16b, v1.16b, #1
ext v5.16b, v0.16b, v1.16b, #2
ext v6.16b, v0.16b, v1.16b, #3
ext v16.16b, v1.16b, v2.16b, #1
ext v17.16b, v1.16b, v2.16b, #2
ext v18.16b, v1.16b, v2.16b, #3
// usdot accumulates into its destination, so clear the accumulators first.
movi v20.2d, #0
movi v21.2d, #0
movi v22.2d, #0
movi v23.2d, #0
// Lane i of v20+k = result 4*i + k.
usdot v20.4s, v0.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b
movi v24.2d, #0
movi v25.2d, #0
movi v26.2d, #0
movi v27.2d, #0
// Lane i of v24+k = result 16 + 4*i + k.
usdot v24.4s, v1.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b
usdot v27.4s, v18.16b, v30.16b
// Narrow to 16 bits; v20+k holds results k, k+4, ..., k+28.
xtn v20.4h, v20.4s
xtn2 v20.8h, v24.4s
xtn v21.4h, v21.4s
xtn2 v21.8h, v25.4s
xtn v22.4h, v22.4s
xtn2 v22.8h, v26.4s
xtn v23.4h, v23.4s
xtn2 v23.8h, v27.4s
// st4 interleaves the four registers, writing results 0..31 in order; x0 += 64.
st4 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
// Fetch the 8 bytes that follow the 64 already loaded (needed by the last
// windows) and move x1 to the start of the next row.
ld1 {v7.8b}, [x1], x2
// Shifted copies of source bytes 32..47 and 48..63 (the latter refilled from v7).
ext v4.16b, v2.16b, v3.16b, #1
ext v5.16b, v2.16b, v3.16b, #2
ext v6.16b, v2.16b, v3.16b, #3
ext v16.16b, v3.16b, v7.16b, #1
ext v17.16b, v3.16b, v7.16b, #2
ext v18.16b, v3.16b, v7.16b, #3
movi v20.2d, #0
movi v21.2d, #0
movi v22.2d, #0
movi v23.2d, #0
// Lane i of v20+k = result 32 + 4*i + k.
usdot v20.4s, v2.16b, v30.16b
usdot v21.4s, v4.16b, v30.16b
usdot v22.4s, v5.16b, v30.16b
usdot v23.4s, v6.16b, v30.16b
movi v24.2d, #0
movi v25.2d, #0
movi v26.2d, #0
movi v27.2d, #0
// Lane i of v24+k = result 48 + 4*i + k.
usdot v24.4s, v3.16b, v30.16b
usdot v25.4s, v16.16b, v30.16b
usdot v26.4s, v17.16b, v30.16b
usdot v27.4s, v18.16b, v30.16b
// Narrow to 16 bits; v20+k holds results 32+k, 36+k, ..., 60+k.
xtn v20.4h, v20.4s
xtn2 v20.8h, v24.4s
xtn v21.4h, v21.4s
xtn2 v21.8h, v25.4s
xtn v22.4h, v22.4s
xtn2 v22.8h, v26.4s
xtn v23.4h, v23.4s
xtn2 v23.8h, v27.4s
// Results 32..63 in order; together with the first store x0 has advanced by 128 bytes.
st4 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
b.ne 1b
ret
endfunc
.macro EPEL_UNI_W_H_HEADER
ldr x12, [sp]
sub x2, x2, #1

@ -171,6 +171,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width),);
/* Prototype(s) for the 8-bit horizontal EPEL "put" routines with the _i8mm
 * suffix. NEON8_FNPROTO is defined outside this hunk; presumably it expands
 * to one declaration per block width (the assembly defines h4 ... h64) --
 * confirm against the macro definition. */
NEON8_FNPROTO(epel_h, (int16_t *dst,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@ -283,6 +287,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);

Loading…
Cancel
Save