aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w

put_luma_pixels_8_4x4_c:                                 0.2 ( 1.00x)
put_luma_pixels_8_4x4_neon:                              0.2 ( 1.00x)
put_luma_pixels_8_8x8_c:                                 0.7 ( 1.00x)
put_luma_pixels_8_8x8_neon:                              0.2 ( 3.22x)
put_luma_pixels_8_16x16_c:                               2.2 ( 1.00x)
put_luma_pixels_8_16x16_neon:                            0.2 ( 9.89x)
put_luma_pixels_8_32x32_c:                               8.2 ( 1.00x)
put_luma_pixels_8_32x32_neon:                            1.2 ( 6.71x)
put_luma_pixels_8_64x64_c:                              33.7 ( 1.00x)
put_luma_pixels_8_64x64_neon:                            2.5 (13.63x)
put_luma_pixels_8_128x128_c:                           145.5 ( 1.00x)
put_luma_pixels_8_128x128_neon:                         10.2 (14.23x)
put_uni_pixels_luma_8_4x4_c:                             0.5 ( 1.00x)
put_uni_pixels_luma_8_4x4_neon:                          0.0 ( 0.00x)
put_uni_pixels_luma_8_8x8_c:                             0.5 ( 1.00x)
put_uni_pixels_luma_8_8x8_neon:                          0.2 ( 2.11x)
put_uni_pixels_luma_8_16x16_c:                           1.2 ( 1.00x)
put_uni_pixels_luma_8_16x16_neon:                        0.2 ( 5.44x)
put_uni_pixels_luma_8_32x32_c:                           3.0 ( 1.00x)
put_uni_pixels_luma_8_32x32_neon:                        0.5 ( 6.26x)
put_uni_pixels_luma_8_64x64_c:                           3.0 ( 1.00x)
put_uni_pixels_luma_8_64x64_neon:                        1.7 ( 1.72x)
put_uni_pixels_luma_8_128x128_c:                         6.5 ( 1.00x)
put_uni_pixels_luma_8_128x128_neon:                      6.5 ( 1.00x)
release/7.1
Zhao Zhili 4 months ago committed by Nuo Mi
parent 20f2bf5530
commit 25448d1716
  1. 22
      libavcodec/aarch64/h26x/dsp.h
  2. 189
      libavcodec/aarch64/h26x/epel_neon.S
  3. 81
      libavcodec/aarch64/h26x/qpel_neon.S
  4. 1
      libavcodec/aarch64/vvc/Makefile
  5. 21
      libavcodec/aarch64/vvc/dsp_init.c

@ -248,4 +248,26 @@ NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t *_src, ptrdiff_t _src
NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),)
/*
 * NEON8_FNPROTO_PARTIAL_6(fn, args, ext) declares six 8-bit NEON prototypes
 *     void ff_vvc_put_<fn><W>_8_neon<ext> args;
 * for W = 4, 8, 16, 32, 64 and 128, all sharing the argument list `args`.
 * Any earlier definition of the macro name is dropped first.
 */
#undef NEON8_FNPROTO_PARTIAL_6
#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
void ff_vvc_put_##fn##4_8_neon##ext args; \
void ff_vvc_put_##fn##8_8_neon##ext args; \
void ff_vvc_put_##fn##16_8_neon##ext args; \
void ff_vvc_put_##fn##32_8_neon##ext args; \
void ff_vvc_put_##fn##64_8_neon##ext args; \
void ff_vvc_put_##fn##128_8_neon##ext args
/* pel_pixels: int16_t destination, no destination stride argument. */
NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride, int height,
const int8_t *hf, const int8_t *vf, int width),);
/* pel_uni_pixels: uint8_t destination with its own stride. */
NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, int height,
const int8_t *hf, const int8_t *vf, int width),);
/* pel_uni_w_pixels: as pel_uni_pixels plus the denom / wx / ox arguments. */
NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
const int8_t *hf, const int8_t *vf, int width),);
#endif

@ -19,7 +19,8 @@
*/
#include "libavutil/aarch64/asm.S"
#define MAX_PB_SIZE 64
#define HEVC_MAX_PB_SIZE 64
#define VVC_MAX_PB_SIZE 128
const epel_filters, align=4
.byte 0, 0, 0, 0
@ -131,8 +132,13 @@ endconst
b.ne 1b
.endm
// VVC entry point for the 4-pixel-wide put_pel_pixels.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and jumps to the next local label "1",
// which is the loop label inside ff_hevc_put_hevc_pel_pixels4_8_neon that
// follows in this file, so that function's own x7 setup is skipped.
// NOTE(review): x7 is presumably the dst row pitch in bytes; the store that
// consumes it is not visible in this hunk - confirm.
function ff_vvc_put_pel_pixels4_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
// "1f" binds to the nearest following "1:", so this stub must stay directly
// ahead of the HEVC function it shares a body with.
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2)
mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.s}[0], [x1], x2
ushll v4.8h, v0.8b, #6
subs w3, w3, #1
@ -142,7 +148,7 @@ function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2 - 8)
mov x7, #(HEVC_MAX_PB_SIZE * 2 - 8)
1: ld1 {v0.8b}, [x1], x2
ushll v4.8h, v0.8b, #6
st1 {v4.d}[0], [x0], #8
@ -152,8 +158,13 @@ function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
ret
endfunc
// VVC entry point for the 8-pixel-wide put_pel_pixels.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and jumps to the next local label "1",
// which is the loop label inside ff_hevc_put_hevc_pel_pixels8_8_neon that
// follows in this file, so that function's own x7 setup is skipped.
// NOTE(review): x7 is presumably the dst row pitch in bytes; the store that
// consumes it is not visible in this hunk - confirm.
function ff_vvc_put_pel_pixels8_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
// "1f" binds to the nearest following "1:", so this stub must stay directly
// ahead of the HEVC function it shares a body with.
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2)
mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b}, [x1], x2
ushll v4.8h, v0.8b, #6
subs w3, w3, #1
@ -163,7 +174,7 @@ function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2 - 16)
mov x7, #(HEVC_MAX_PB_SIZE * 2 - 16)
1: ld1 {v0.8b, v1.8b}, [x1], x2
ushll v4.8h, v0.8b, #6
st1 {v4.8h}, [x0], #16
@ -174,8 +185,13 @@ function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
ret
endfunc
// VVC entry point for the 16-pixel-wide put_pel_pixels.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and jumps to the next local label "1",
// which is the loop label inside ff_hevc_put_hevc_pel_pixels16_8_neon that
// follows in this file, so that function's own x7 setup is skipped.
// NOTE(review): x7 is presumably the dst row pitch in bytes; the store that
// consumes it is not visible in this hunk - confirm.
function ff_vvc_put_pel_pixels16_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
// "1f" binds to the nearest following "1:", so this stub must stay directly
// ahead of the HEVC function it shares a body with.
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2)
mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b, v1.8b}, [x1], x2
ushll v4.8h, v0.8b, #6
ushll v5.8h, v1.8b, #6
@ -186,7 +202,7 @@ function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2)
mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b-v2.8b}, [x1], x2
ushll v4.8h, v0.8b, #6
ushll v5.8h, v1.8b, #6
@ -197,8 +213,13 @@ function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
ret
endfunc
// VVC entry point for the 32-pixel-wide put_pel_pixels.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and jumps to the next local label "1",
// which is the loop label inside ff_hevc_put_hevc_pel_pixels32_8_neon that
// follows in this file, so that function's own x7 setup is skipped.
// NOTE(review): x7 is presumably the dst row pitch in bytes; the store that
// consumes it is not visible in this hunk - confirm.
function ff_vvc_put_pel_pixels32_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
// "1f" binds to the nearest following "1:", so this stub must stay directly
// ahead of the HEVC function it shares a body with.
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2)
mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b-v3.8b}, [x1], x2
ushll v4.8h, v0.8b, #6
ushll v5.8h, v1.8b, #6
@ -211,7 +232,7 @@ function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
mov x7, #(MAX_PB_SIZE)
mov x7, #(HEVC_MAX_PB_SIZE)
1: ld1 {v0.16b-v2.16b}, [x1], x2
ushll v4.8h, v0.8b, #6
ushll2 v5.8h, v0.16b, #6
@ -226,26 +247,50 @@ function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
ret
endfunc
// Widen the 64 source bytes held in v0-v3 to 16 bits, shift each left by 6,
// and store the resulting 128 bytes at x0.
// In:    v0-v3 = 64 source bytes, x0 = dst, x7 = advance applied to x0 after
//        the second store
// Out:   x0 advanced by 64 + x7
// Clobb: v4-v7, v16-v19 (flags untouched, so callers may issue subs first)
.macro put_pel_pixels64_8_neon
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        ushll2          v7.8h, v1.16b, #6
        st1             {v4.8h-v7.8h}, [x0], #64
        ushll           v16.8h, v2.8b, #6
        ushll2          v17.8h, v2.16b, #6
        ushll           v18.8h, v3.8b, #6
        ushll2          v19.8h, v3.16b, #6
        st1             {v16.8h-v19.8h}, [x0], x7
.endm

// VVC entry point for the 64-pixel-wide put_pel_pixels.
// The macro advances x0 by 64 + x7 per row, so x7 = 2 * VVC_MAX_PB_SIZE - 64
// gives a total step of 2 * VVC_MAX_PB_SIZE bytes per row. Falls into the loop
// of the HEVC function below via the local label "1".
function ff_vvc_put_pel_pixels64_8_neon, export=1
        mov             x7, #(2 * VVC_MAX_PB_SIZE - 64)
        b               1f
endfunc

// put_pel_pixels for 64-pixel-wide blocks, 8-bit source.
// Per the pel_pixels C prototype: x0 = dst, x1 = src, x2 = srcstride,
// w3 = height. One row (64 source bytes) is processed per iteration.
function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
        mov             x7, #(HEVC_MAX_PB_SIZE)         // 64 + 64 = HEVC_MAX_PB_SIZE * 2 bytes per row
1:
        ld1             {v0.16b-v3.16b}, [x1], x2       // load one row, step src by srcstride
        subs            w3, w3, #1                      // row counter; flags survive the macro
        put_pel_pixels64_8_neon
        b.ne            1b
        ret
endfunc
// put_pel_pixels for 128-pixel-wide blocks, 8-bit source.
// Per the pel_pixels C prototype: x0 = dst (int16_t *), x1 = src,
// x2 = srcstride, w3 = height.
// Each iteration handles one row as two 64-pixel halves through
// put_pel_pixels64_8_neon, which advances x0 by 64 + x7 per invocation.
function ff_vvc_put_pel_pixels128_8_neon, export=1
// With x7 = 64 the two macro invocations advance x0 by 4 * 64 = 256 bytes per row.
mov x7, #64
1:
// x6 walks along the current row while x1 keeps the row start.
mov x6, x1
ld1 {v0.16b-v3.16b}, [x6], #64
// Step x1 to the next source row.
add x1, x1, x2
// Row counter; the flags are consumed by b.ne below and nothing in between
// (ld1 and the ushll/st1 of the macro) writes them.
subs w3, w3, #1
put_pel_pixels64_8_neon
// Second 64 source bytes of the same row.
ld1 {v0.16b-v3.16b}, [x6], #64
put_pel_pixels64_8_neon
b.ne 1b
ret
endfunc
function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.s}[0], [x2], x3 // src
ushll v16.8h, v0.8b, #6
ld1 {v20.4h}, [x4], x10 // src2
@ -258,7 +303,7 @@ function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
sub x1, x1, #4
1: ld1 {v0.8b}, [x2], x3
ushll v16.8h, v0.8b, #6
@ -273,7 +318,7 @@ function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6
ld1 {v20.8h}, [x4], x10 // src2
@ -286,7 +331,7 @@ function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
sub x1, x1, #8
1: ld1 {v0.16b}, [x2], x3
ushll v16.8h, v0.8b, #6
@ -304,7 +349,7 @@ function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.16b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
@ -320,7 +365,7 @@ function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b-v2.8b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6
ushll v17.8h, v1.8b, #6
@ -339,7 +384,7 @@ function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.16b-v1.16b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
@ -361,7 +406,7 @@ function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
endfunc
function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
mov x10, #(MAX_PB_SIZE)
mov x10, #(HEVC_MAX_PB_SIZE)
1: ld1 {v0.16b-v2.16b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6
@ -369,7 +414,7 @@ function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
ushll2 v19.8h, v1.16b, #6
ushll v20.8h, v2.8b, #6
ushll2 v21.8h, v2.16b, #6
ld1 {v24.8h-v27.8h}, [x4], #(MAX_PB_SIZE) // src2
ld1 {v24.8h-v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2
sqadd v16.8h, v16.8h, v24.8h
sqadd v17.8h, v17.8h, v25.8h
sqadd v18.8h, v18.8h, v26.8h
@ -399,12 +444,12 @@ function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
ushll2 v21.8h, v2.16b, #6
ushll v22.8h, v3.8b, #6
ushll2 v23.8h, v3.16b, #6
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2
sqadd v16.8h, v16.8h, v24.8h
sqadd v17.8h, v17.8h, v25.8h
sqadd v18.8h, v18.8h, v26.8h
sqadd v19.8h, v19.8h, v27.8h
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE)
sqadd v20.8h, v20.8h, v24.8h
sqadd v21.8h, v21.8h, v25.8h
sqadd v22.8h, v22.8h, v26.8h
@ -427,7 +472,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
load_epel_filterb x6, x7
sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v4.8b}, [x2], x3
ext v5.8b, v4.8b, v4.8b, #1
ext v6.8b, v4.8b, v4.8b, #2
@ -446,7 +491,7 @@ function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1
load_epel_filterb x6, x7
sub w1, w1, #4
sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b}, [x2], x3
ext v26.16b, v24.16b, v24.16b, #1
ext v27.16b, v24.16b, v24.16b, #2
@ -465,7 +510,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1
load_epel_filterb x6, x7
sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b}, [x2], x3
ext v26.16b, v24.16b, v24.16b, #1
ext v27.16b, v24.16b, v24.16b, #2
@ -484,7 +529,7 @@ function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1
load_epel_filterb x6, x7
sub x1, x1, #8
sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b}, [x2], x3
ext v26.16b, v24.16b, v24.16b, #1
ext v27.16b, v24.16b, v24.16b, #2
@ -506,7 +551,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1
load_epel_filterb x6, x7
sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ldr q24, [x2]
ldr s25, [x2, #16]
add x2, x2, x3
@ -529,7 +574,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1
load_epel_filterb x6, x7
sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b, v25.16b}, [x2], x3
ext v26.16b, v24.16b, v25.16b, #1
ext v27.16b, v24.16b, v25.16b, #2
@ -556,7 +601,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1
load_epel_filterb x6, x7
sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ldp q24, q25, [x2]
ldr s26, [x2, #32]
add x2, x2, x3
@ -589,7 +634,7 @@ function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1
load_epel_filterb x6, x7
sub x2, x2, #1
mov x7, #24
mov x10, #(MAX_PB_SIZE * 2 - 48)
mov x10, #(HEVC_MAX_PB_SIZE * 2 - 48)
1: ld1 {v24.16b, v25.16b, v26.16b}, [x2]
ldr s27, [x2, #48]
add x2, x2, x3
@ -683,7 +728,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1
load_epel_filterb x7, x6
sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.s}[0], [x2], x3
ld1 {v17.s}[0], [x2], x3
ld1 {v18.s}[0], [x2], x3
@ -705,7 +750,7 @@ function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1
load_epel_filterb x7, x6
sub x2, x2, x3
sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b}, [x2], x3
ld1 {v17.8b}, [x2], x3
ld1 {v18.8b}, [x2], x3
@ -727,7 +772,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1
load_epel_filterb x7, x6
sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b}, [x2], x3
ld1 {v17.8b}, [x2], x3
ld1 {v18.8b}, [x2], x3
@ -749,7 +794,7 @@ function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1
load_epel_filterb x7, x6
sub x1, x1, #8
sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b}, [x2], x3
ld1 {v17.16b}, [x2], x3
ld1 {v18.16b}, [x2], x3
@ -774,7 +819,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1
load_epel_filterb x7, x6
sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b}, [x2], x3
ld1 {v17.16b}, [x2], x3
ld1 {v18.16b}, [x2], x3
@ -798,7 +843,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1
load_epel_filterb x7, x6
sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b, v17.8b, v18.8b}, [x2], x3
ld1 {v19.8b, v20.8b, v21.8b}, [x2], x3
ld1 {v22.8b, v23.8b, v24.8b}, [x2], x3
@ -825,7 +870,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
load_epel_filterb x7, x6
sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b, v17.16b}, [x2], x3
ld1 {v18.16b, v19.16b}, [x2], x3
ld1 {v20.16b, v21.16b}, [x2], x3
@ -895,7 +940,7 @@ endfunc
function ff_hevc_put_hevc_epel_v4_8_neon, export=1
load_epel_filterb x5, x4
sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr s16, [x1]
ldr s17, [x1, x2]
add x1, x1, x2, lsl #1
@ -915,7 +960,7 @@ endfunc
function ff_hevc_put_hevc_epel_v6_8_neon, export=1
load_epel_filterb x5, x4
sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2 - 8)
mov x10, #(HEVC_MAX_PB_SIZE * 2 - 8)
ldr d16, [x1]
ldr d17, [x1, x2]
add x1, x1, x2, lsl #1
@ -936,7 +981,7 @@ endfunc
function ff_hevc_put_hevc_epel_v8_8_neon, export=1
load_epel_filterb x5, x4
sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr d16, [x1]
ldr d17, [x1, x2]
add x1, x1, x2, lsl #1
@ -956,7 +1001,7 @@ endfunc
function ff_hevc_put_hevc_epel_v12_8_neon, export=1
load_epel_filterb x5, x4
sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [x1]
ldr q17, [x1, x2]
add x1, x1, x2, lsl #1
@ -980,7 +1025,7 @@ endfunc
function ff_hevc_put_hevc_epel_v16_8_neon, export=1
load_epel_filterb x5, x4
sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [x1]
ldr q17, [x1, x2]
add x1, x1, x2, lsl #1
@ -1002,7 +1047,7 @@ endfunc
function ff_hevc_put_hevc_epel_v24_8_neon, export=1
load_epel_filterb x5, x4
sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
@ -1025,7 +1070,7 @@ endfunc
function ff_hevc_put_hevc_epel_v32_8_neon, export=1
load_epel_filterb x5, x4
sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b, v17.16b}, [x1], x2
ld1 {v18.16b, v19.16b}, [x1], x2
ld1 {v20.16b, v21.16b}, [x1], x2
@ -1327,7 +1372,7 @@ endfunc
add x5, x5, x4, lsl #2
ld1r {v30.4s}, [x5]
sub x1, x1, #1
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
.endm
function ff_hevc_put_hevc_epel_h4_8_neon, export=1
@ -2179,7 +2224,7 @@ DISABLE_I8MM
function hevc_put_hevc_epel_hv4_8_end_neon
load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr d16, [sp]
ldr d17, [sp, x10]
add sp, sp, x10, lsl #1
@ -2198,7 +2243,7 @@ endfunc
function hevc_put_hevc_epel_hv6_8_end_neon
load_epel_filterh x5, x4
mov x5, #120
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [sp]
ldr q17, [sp, x10]
add sp, sp, x10, lsl #1
@ -2218,7 +2263,7 @@ endfunc
function hevc_put_hevc_epel_hv8_8_end_neon
load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [sp]
ldr q17, [sp, x10]
add sp, sp, x10, lsl #1
@ -2238,7 +2283,7 @@ endfunc
function hevc_put_hevc_epel_hv12_8_end_neon
load_epel_filterh x5, x4
mov x5, #112
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -2258,7 +2303,7 @@ endfunc
function hevc_put_hevc_epel_hv16_8_end_neon
load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -2278,7 +2323,7 @@ endfunc
function hevc_put_hevc_epel_hv24_8_end_neon
load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -2462,7 +2507,7 @@ epel_hv neon
function hevc_put_hevc_epel_uni_hv4_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.4h}, [sp], x10
ld1 {v17.4h}, [sp], x10
ld1 {v18.4h}, [sp], x10
@ -2481,7 +2526,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv6_8_end_neon
load_epel_filterh x6, x5
sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10
@ -2501,7 +2546,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv8_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10
@ -2521,7 +2566,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv12_8_end_neon
load_epel_filterh x6, x5
sub x1, x1, #8
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -2543,7 +2588,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv16_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -2565,7 +2610,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv24_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -3223,7 +3268,7 @@ DISABLE_I8MM
function hevc_put_hevc_epel_uni_w_hv4_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.4h}, [sp], x10
ld1 {v17.4h}, [sp], x10
ld1 {v18.4h}, [sp], x10
@ -3273,7 +3318,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv6_8_end_neon
load_epel_filterh x6, x5
sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10
@ -3326,7 +3371,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv8_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10
@ -3376,7 +3421,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv12_8_end_neon
load_epel_filterh x6, x5
sub x1, x1, #8
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -3437,7 +3482,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv16_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -3498,7 +3543,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv24_8_end_neon
load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -3795,7 +3840,7 @@ epel_uni_w_hv neon
function hevc_put_hevc_epel_bi_hv4_8_end_neon
load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.4h}, [sp], x10
ld1 {v17.4h}, [sp], x10
ld1 {v18.4h}, [sp], x10
@ -3816,7 +3861,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv6_8_end_neon
load_epel_filterh x7, x6
sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10
@ -3838,7 +3883,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv8_8_end_neon
load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10
@ -3860,7 +3905,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv12_8_end_neon
load_epel_filterh x7, x6
sub x1, x1, #8
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -3885,7 +3930,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv16_8_end_neon
load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@ -3910,7 +3955,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv24_8_end_neon
load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -3939,7 +3984,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv32_8_end_neon
load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2)
mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10

@ -1250,6 +1250,10 @@ function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
endfunc
// VVC alias for the 4-pixel-wide pel_uni_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_pixels4_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels4_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
1:
ldr s0, [x2]
@ -1278,6 +1282,10 @@ function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
ret
endfunc
// VVC alias for the 8-pixel-wide pel_uni_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_pixels8_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels8_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
1:
ldr d0, [x2]
@ -1306,6 +1314,10 @@ function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
ret
endfunc
// VVC alias for the 16-pixel-wide pel_uni_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_pixels16_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels16_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
1:
ldr q0, [x2]
@ -1328,6 +1340,10 @@ function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
ret
endfunc
// VVC alias for the 32-pixel-wide pel_uni_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_pixels32_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels32_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
1:
ld1 {v0.16b, v1.16b}, [x2], x3
@ -1346,6 +1362,10 @@ function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
ret
endfunc
// VVC alias for the 64-pixel-wide pel_uni_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_pixels64_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels64_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
1:
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
@ -1355,6 +1375,19 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
ret
endfunc
// pel_uni_pixels for 128-pixel-wide blocks: copies 128 bytes per row from
// src to dst.
// Per the pel_uni_pixels C prototype: x0 = _dst, x1 = _dststride, x2 = _src,
// x3 = _srcstride, w4 = height.
function ff_vvc_put_pel_uni_pixels128_8_neon, export=1
// Each row is moved as two 64-byte halves; the first half post-increments the
// pointer by 64, so the second half must only add (stride - 64).
sub x1, x1, #64
sub x3, x3, #64
1:
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
// Row counter; flags are consumed by b.ne below (ld1/st1 do not write them).
subs w4, w4, #1
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
b.ne 1b
ret
endfunc
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
load_qpel_filterb x6, x5
sub x2, x2, x3, lsl #1
@ -1528,6 +1561,10 @@ function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
// VVC alias for the 4-pixel-wide pel_uni_w_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_w_pixels4_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
mov w10, #-6
sub w10, w10, w5
@ -1598,6 +1635,10 @@ function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
ret
endfunc
// VVC alias for the 8-pixel-wide pel_uni_w_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_w_pixels8_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
mov w10, #-6
sub w10, w10, w5
@ -1741,7 +1782,9 @@ function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
ret
endfunc
// VVC alias for the 16-pixel-wide pel_uni_w_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_w_pixels16_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
mov w10, #-6
@ -1803,6 +1846,9 @@ function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
ret
endfunc
// VVC alias for the 32-pixel-wide pel_uni_w_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_w_pixels32_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
mov w10, #-6
@ -1839,6 +1885,39 @@ function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
ret
endfunc
// VVC alias for the 64-pixel-wide pel_uni_w_pixels: branches (without link) to
// the HEVC implementation, leaving all argument registers untouched.
function ff_vvc_put_pel_uni_w_pixels64_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon)
endfunc
// pel_uni_w_pixels for 128-pixel-wide blocks, 8-bit samples.
// Per the pel_uni_w_pixels C prototype: x0 = _dst, x1 = _dststride,
// x2 = _src, x3 = _srcstride, w4 = height, w5 = denom, w6 = wx, w7 = ox.
// Constants set up for PEL_UNI_W_PIXEL_CALC (macro defined elsewhere in this
// file): v30 = wx in every 16-bit lane, v31 = (-6 - denom) and v29 = ox in
// every 32-bit lane.
// NOTE(review): the macro presumably leaves its result in its first operand,
// since v0-v3 are what gets stored afterwards - confirm against the macro.
// Each row is processed as two 64-byte halves.
function ff_vvc_put_pel_uni_w_pixels128_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5                    // w10 = -6 - denom
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
        // The first half post-increments by 64, so the second half only
        // needs to add (stride - 64) to reach the next row.
        sub             x1, x1, #64
        sub             x3, x3, #64
1:
        // The per-row copies of x2/x0 into x11/x12 were dropped: nothing in
        // this function reads them, and the macro is only given vector
        // registers (confirm its body has no hidden x11/x12 use).
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        subs            w4, w4, #1                      // row counter, consumed by b.ne below
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
.macro QPEL_UNI_W_V_HEADER
ldur x12, [sp, #8] // my
sub x2, x2, x3, lsl #1

@ -3,5 +3,6 @@ clean::
OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/dsp_init.o
NEON-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/alf.o \
aarch64/h26x/epel_neon.o \
aarch64/h26x/qpel_neon.o \
aarch64/h26x/sao_neon.o

@ -46,6 +46,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
return;
if (bd == 8) {
c->inter.put[0][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
c->inter.put[0][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
c->inter.put[0][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
c->inter.put[0][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
c->inter.put[0][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
c->inter.put[0][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon;
c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon;
c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon;
@ -53,6 +60,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[0][5][0][1] =
c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
c->inter.put_uni[0][4][0][0] = ff_vvc_put_pel_uni_pixels32_8_neon;
c->inter.put_uni[0][5][0][0] = ff_vvc_put_pel_uni_pixels64_8_neon;
c->inter.put_uni[0][6][0][0] = ff_vvc_put_pel_uni_pixels128_8_neon;
c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon;
c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon;
c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon;
@ -60,6 +74,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put_uni[0][5][0][1] =
c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon;
c->inter.put_uni_w[0][1][0][0] = ff_vvc_put_pel_uni_w_pixels4_8_neon;
c->inter.put_uni_w[0][2][0][0] = ff_vvc_put_pel_uni_w_pixels8_8_neon;
c->inter.put_uni_w[0][3][0][0] = ff_vvc_put_pel_uni_w_pixels16_8_neon;
c->inter.put_uni_w[0][4][0][0] = ff_vvc_put_pel_uni_w_pixels32_8_neon;
c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon;
c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;

Loading…
Cancel
Save