From 25448d17160fe844c79d4ec8e659a704c9a57c9d Mon Sep 17 00:00:00 2001 From: Zhao Zhili Date: Sat, 7 Sep 2024 14:07:17 +0800 Subject: [PATCH] aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w put_luma_pixels_8_4x4_c: 0.2 ( 1.00x) put_luma_pixels_8_4x4_neon: 0.2 ( 1.00x) put_luma_pixels_8_8x8_c: 0.7 ( 1.00x) put_luma_pixels_8_8x8_neon: 0.2 ( 3.22x) put_luma_pixels_8_16x16_c: 2.2 ( 1.00x) put_luma_pixels_8_16x16_neon: 0.2 ( 9.89x) put_luma_pixels_8_32x32_c: 8.2 ( 1.00x) put_luma_pixels_8_32x32_neon: 1.2 ( 6.71x) put_luma_pixels_8_64x64_c: 33.7 ( 1.00x) put_luma_pixels_8_64x64_neon: 2.5 (13.63x) put_luma_pixels_8_128x128_c: 145.5 ( 1.00x) put_luma_pixels_8_128x128_neon: 10.2 (14.23x) put_uni_pixels_luma_8_4x4_c: 0.5 ( 1.00x) put_uni_pixels_luma_8_4x4_neon: 0.0 ( 0.00x) put_uni_pixels_luma_8_8x8_c: 0.5 ( 1.00x) put_uni_pixels_luma_8_8x8_neon: 0.2 ( 2.11x) put_uni_pixels_luma_8_16x16_c: 1.2 ( 1.00x) put_uni_pixels_luma_8_16x16_neon: 0.2 ( 5.44x) put_uni_pixels_luma_8_32x32_c: 3.0 ( 1.00x) put_uni_pixels_luma_8_32x32_neon: 0.5 ( 6.26x) put_uni_pixels_luma_8_64x64_c: 3.0 ( 1.00x) put_uni_pixels_luma_8_64x64_neon: 1.7 ( 1.72x) put_uni_pixels_luma_8_128x128_c: 6.5 ( 1.00x) put_uni_pixels_luma_8_128x128_neon: 6.5 ( 1.00x) --- libavcodec/aarch64/h26x/dsp.h | 22 ++++ libavcodec/aarch64/h26x/epel_neon.S | 189 +++++++++++++++++----------- libavcodec/aarch64/h26x/qpel_neon.S | 81 +++++++++++- libavcodec/aarch64/vvc/Makefile | 1 + libavcodec/aarch64/vvc/dsp_init.c | 21 ++++ 5 files changed, 241 insertions(+), 73 deletions(-) diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h index f72746ce03..076d01b477 100644 --- a/libavcodec/aarch64/h26x/dsp.h +++ b/libavcodec/aarch64/h26x/dsp.h @@ -248,4 +248,26 @@ NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t *_src, ptrdiff_t _src NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),) +#undef NEON8_FNPROTO_PARTIAL_6 +#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \ + void ff_vvc_put_##fn##4_8_neon##ext args; \ + void ff_vvc_put_##fn##8_8_neon##ext args; \ + void ff_vvc_put_##fn##16_8_neon##ext args; \ + void ff_vvc_put_##fn##32_8_neon##ext args; \ + void ff_vvc_put_##fn##64_8_neon##ext args; \ + void ff_vvc_put_##fn##128_8_neon##ext args + +NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst, + const uint8_t *src, ptrdiff_t srcstride, int height, + const int8_t *hf, const int8_t *vf, int width),); + +NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int height, + const int8_t *hf, const int8_t *vf, int width),); + +NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, + const int8_t *hf, const int8_t *vf, int width),); + #endif diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S index 378b0f7fb2..8ca42a5c3a 100644 --- a/libavcodec/aarch64/h26x/epel_neon.S +++ b/libavcodec/aarch64/h26x/epel_neon.S @@ -19,7 +19,8 @@ */ #include "libavutil/aarch64/asm.S" -#define MAX_PB_SIZE 64 +#define HEVC_MAX_PB_SIZE 64 +#define VVC_MAX_PB_SIZE 128 const epel_filters, align=4 .byte 0, 0, 0, 0 @@ -131,8 +132,13 @@ endconst b.ne 1b .endm +function ff_vvc_put_pel_pixels4_8_neon, export=1 + mov x7, #(VVC_MAX_PB_SIZE * 2) + b 1f +endfunc + function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1 - mov x7, #(MAX_PB_SIZE * 2) + mov x7, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.s}[0], [x1], x2 ushll v4.8h, v0.8b, #6 subs w3, w3, #1 @@ -142,7 +148,7 @@ function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1 - mov x7, #(MAX_PB_SIZE * 2 - 8) + mov x7, #(HEVC_MAX_PB_SIZE * 2 - 8) 1: ld1 {v0.8b}, [x1], x2 ushll v4.8h, v0.8b, #6 st1 {v4.d}[0], [x0], #8 @@ -152,8 +158,13 @@ function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_pixels8_8_neon, export=1 + mov x7, #(VVC_MAX_PB_SIZE * 2) + b 1f +endfunc + function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1 - mov x7, #(MAX_PB_SIZE * 2) + mov x7, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.8b}, [x1], x2 ushll v4.8h, v0.8b, #6 subs w3, w3, #1 @@ -163,7 +174,7 @@ function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1 - mov x7, #(MAX_PB_SIZE * 2 - 16) + mov x7, #(HEVC_MAX_PB_SIZE * 2 - 16) 1: ld1 {v0.8b, v1.8b}, [x1], x2 ushll v4.8h, v0.8b, #6 st1 {v4.8h}, [x0], #16 @@ -174,8 +185,13 @@ function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_pixels16_8_neon, export=1 + mov x7, #(VVC_MAX_PB_SIZE * 2) + b 1f +endfunc + function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1 - mov x7, #(MAX_PB_SIZE * 2) + mov x7, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.8b, v1.8b}, [x1], x2 ushll v4.8h, v0.8b, #6 ushll v5.8h, v1.8b, #6 @@ -186,7 +202,7 @@ function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1 - mov x7, #(MAX_PB_SIZE * 2) + mov x7, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.8b-v2.8b}, [x1], x2 ushll v4.8h, v0.8b, #6 ushll v5.8h, v1.8b, #6 @@ -197,8 +213,13 @@ function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_pixels32_8_neon, export=1 + mov x7, #(VVC_MAX_PB_SIZE * 2) + b 1f +endfunc + function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1 - mov x7, #(MAX_PB_SIZE * 2) + mov x7, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.8b-v3.8b}, [x1], x2 ushll v4.8h, v0.8b, #6 ushll v5.8h, v1.8b, #6 @@ -211,7 +232,7 @@ function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1 - mov x7, #(MAX_PB_SIZE) + mov x7, #(HEVC_MAX_PB_SIZE) 1: ld1 {v0.16b-v2.16b}, [x1], x2 ushll v4.8h, v0.8b, #6 ushll2 v5.8h, v0.16b, #6 @@ -226,26 +247,50 @@ function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1 ret endfunc -function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1 -1: ld1 {v0.16b-v3.16b}, [x1], x2 +.macro put_pel_pixels64_8_neon ushll v4.8h, v0.8b, #6 ushll2 v5.8h, v0.16b, #6 ushll v6.8h, v1.8b, #6 ushll2 v7.8h, v1.16b, #6 - st1 {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE) + st1 {v4.8h-v7.8h}, [x0], #64 ushll v16.8h, v2.8b, #6 ushll2 v17.8h, v2.16b, #6 ushll v18.8h, v3.8b, #6 ushll2 v19.8h, v3.16b, #6 + st1 {v16.8h-v19.8h}, [x0], x7 +.endm + +function ff_vvc_put_pel_pixels64_8_neon, export=1 + mov x7, #(2 * VVC_MAX_PB_SIZE - 64) + b 1f +endfunc + +function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1 + mov x7, #(HEVC_MAX_PB_SIZE) +1: + ld1 {v0.16b-v3.16b}, [x1], x2 subs w3, w3, #1 - st1 {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE) + put_pel_pixels64_8_neon b.ne 1b ret endfunc +function ff_vvc_put_pel_pixels128_8_neon, export=1 + mov x7, #64 +1: + mov x6, x1 + ld1 {v0.16b-v3.16b}, [x6], #64 + add x1, x1, x2 + subs w3, w3, #1 + put_pel_pixels64_8_neon + ld1 {v0.16b-v3.16b}, [x6], #64 + put_pel_pixels64_8_neon + b.ne 1b + ret +endfunc function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.s}[0], [x2], x3 // src ushll v16.8h, v0.8b, #6 ld1 {v20.4h}, [x4], x10 // src2 @@ -258,7 +303,7 @@ function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) sub x1, x1, #4 1: ld1 {v0.8b}, [x2], x3 ushll v16.8h, v0.8b, #6 @@ -273,7 +318,7 @@ function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.8b}, [x2], x3 // src ushll v16.8h, v0.8b, #6 ld1 {v20.8h}, [x4], x10 // src2 @@ -286,7 +331,7 @@ function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) sub x1, x1, #8 1: ld1 {v0.16b}, [x2], x3 ushll v16.8h, v0.8b, #6 @@ -304,7 +349,7 @@ function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.16b}, [x2], x3 // src ushll v16.8h, v0.8b, #6 ushll2 v17.8h, v0.16b, #6 @@ -320,7 +365,7 @@ function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.8b-v2.8b}, [x2], x3 // src ushll v16.8h, v0.8b, #6 ushll v17.8h, v1.8b, #6 @@ -339,7 +384,7 @@ function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v0.16b-v1.16b}, [x2], x3 // src ushll v16.8h, v0.8b, #6 ushll2 v17.8h, v0.16b, #6 @@ -361,7 +406,7 @@ function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1 endfunc function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1 - mov x10, #(MAX_PB_SIZE) + mov x10, #(HEVC_MAX_PB_SIZE) 1: ld1 {v0.16b-v2.16b}, [x2], x3 // src ushll v16.8h, v0.8b, #6 ushll2 v17.8h, v0.16b, #6 @@ -369,7 +414,7 @@ function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1 ushll2 v19.8h, v1.16b, #6 ushll v20.8h, v2.8b, #6 ushll2 v21.8h, v2.16b, #6 - ld1 {v24.8h-v27.8h}, [x4], #(MAX_PB_SIZE) // src2 + ld1 {v24.8h-v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2 sqadd v16.8h, v16.8h, v24.8h sqadd v17.8h, v17.8h, v25.8h sqadd v18.8h, v18.8h, v26.8h @@ -399,12 +444,12 @@ function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1 ushll2 v21.8h, v2.16b, #6 ushll v22.8h, v3.8b, #6 ushll2 v23.8h, v3.16b, #6 - ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2 sqadd v16.8h, v16.8h, v24.8h sqadd v17.8h, v17.8h, v25.8h sqadd v18.8h, v18.8h, v26.8h sqadd v19.8h, v19.8h, v27.8h - ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) sqadd v20.8h, v20.8h, v24.8h sqadd v21.8h, v21.8h, v25.8h sqadd v22.8h, v22.8h, v26.8h @@ -427,7 +472,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1 load_epel_filterb x6, x7 sub x2, x2, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v4.8b}, [x2], x3 ext v5.8b, v4.8b, v4.8b, #1 ext v6.8b, v4.8b, v4.8b, #2 @@ -446,7 +491,7 @@ function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1 load_epel_filterb x6, x7 sub w1, w1, #4 sub x2, x2, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v24.16b}, [x2], x3 ext v26.16b, v24.16b, v24.16b, #1 ext v27.16b, v24.16b, v24.16b, #2 @@ -465,7 +510,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1 load_epel_filterb x6, x7 sub x2, x2, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v24.16b}, [x2], x3 ext v26.16b, v24.16b, v24.16b, #1 ext v27.16b, v24.16b, v24.16b, #2 @@ -484,7 +529,7 @@ function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1 load_epel_filterb x6, x7 sub x1, x1, #8 sub x2, x2, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v24.16b}, [x2], x3 ext v26.16b, v24.16b, v24.16b, #1 ext v27.16b, v24.16b, v24.16b, #2 @@ -506,7 +551,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1 load_epel_filterb x6, x7 sub x2, x2, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ldr q24, [x2] ldr s25, [x2, #16] add x2, x2, x3 @@ -529,7 +574,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1 load_epel_filterb x6, x7 sub x2, x2, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ld1 {v24.16b, v25.16b}, [x2], x3 ext v26.16b, v24.16b, v25.16b, #1 ext v27.16b, v24.16b, v25.16b, #2 @@ -556,7 +601,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1 load_epel_filterb x6, x7 sub x2, x2, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) 1: ldp q24, q25, [x2] ldr s26, [x2, #32] add x2, x2, x3 @@ -589,7 +634,7 @@ function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1 load_epel_filterb x6, x7 sub x2, x2, #1 mov x7, #24 - mov x10, #(MAX_PB_SIZE * 2 - 48) + mov x10, #(HEVC_MAX_PB_SIZE * 2 - 48) 1: ld1 {v24.16b, v25.16b, v26.16b}, [x2] ldr s27, [x2, #48] add x2, x2, x3 @@ -683,7 +728,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1 load_epel_filterb x7, x6 sub x2, x2, x3 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.s}[0], [x2], x3 ld1 {v17.s}[0], [x2], x3 ld1 {v18.s}[0], [x2], x3 @@ -705,7 +750,7 @@ function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1 load_epel_filterb x7, x6 sub x2, x2, x3 sub x1, x1, #4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8b}, [x2], x3 ld1 {v17.8b}, [x2], x3 ld1 {v18.8b}, [x2], x3 @@ -727,7 +772,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1 load_epel_filterb x7, x6 sub x2, x2, x3 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8b}, [x2], x3 ld1 {v17.8b}, [x2], x3 ld1 {v18.8b}, [x2], x3 @@ -749,7 +794,7 @@ function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1 load_epel_filterb x7, x6 sub x1, x1, #8 sub x2, x2, x3 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.16b}, [x2], x3 ld1 {v17.16b}, [x2], x3 ld1 {v18.16b}, [x2], x3 @@ -774,7 +819,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1 load_epel_filterb x7, x6 sub x2, x2, x3 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.16b}, [x2], x3 ld1 {v17.16b}, [x2], x3 ld1 {v18.16b}, [x2], x3 @@ -798,7 +843,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1 load_epel_filterb x7, x6 sub x2, x2, x3 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8b, v17.8b, v18.8b}, [x2], x3 ld1 {v19.8b, v20.8b, v21.8b}, [x2], x3 ld1 {v22.8b, v23.8b, v24.8b}, [x2], x3 @@ -825,7 +870,7 @@ endfunc function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1 load_epel_filterb x7, x6 sub x2, x2, x3 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.16b, v17.16b}, [x2], x3 ld1 {v18.16b, v19.16b}, [x2], x3 ld1 {v20.16b, v21.16b}, [x2], x3 @@ -895,7 +940,7 @@ endfunc function ff_hevc_put_hevc_epel_v4_8_neon, export=1 load_epel_filterb x5, x4 sub x1, x1, x2 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ldr s16, [x1] ldr s17, [x1, x2] add x1, x1, x2, lsl #1 @@ -915,7 +960,7 @@ endfunc function ff_hevc_put_hevc_epel_v6_8_neon, export=1 load_epel_filterb x5, x4 sub x1, x1, x2 - mov x10, #(MAX_PB_SIZE * 2 - 8) + mov x10, #(HEVC_MAX_PB_SIZE * 2 - 8) ldr d16, [x1] ldr d17, [x1, x2] add x1, x1, x2, lsl #1 @@ -936,7 +981,7 @@ endfunc function ff_hevc_put_hevc_epel_v8_8_neon, export=1 load_epel_filterb x5, x4 sub x1, x1, x2 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ldr d16, [x1] ldr d17, [x1, x2] add x1, x1, x2, lsl #1 @@ -956,7 +1001,7 @@ endfunc function ff_hevc_put_hevc_epel_v12_8_neon, export=1 load_epel_filterb x5, x4 sub x1, x1, x2 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ldr q16, [x1] ldr q17, [x1, x2] add x1, x1, x2, lsl #1 @@ -980,7 +1025,7 @@ endfunc function ff_hevc_put_hevc_epel_v16_8_neon, export=1 load_epel_filterb x5, x4 sub x1, x1, x2 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ldr q16, [x1] ldr q17, [x1, x2] add x1, x1, x2, lsl #1 @@ -1002,7 +1047,7 @@ endfunc function ff_hevc_put_hevc_epel_v24_8_neon, export=1 load_epel_filterb x5, x4 sub x1, x1, x2 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2 ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2 ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2 @@ -1025,7 +1070,7 @@ endfunc function ff_hevc_put_hevc_epel_v32_8_neon, export=1 load_epel_filterb x5, x4 sub x1, x1, x2 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.16b, v17.16b}, [x1], x2 ld1 {v18.16b, v19.16b}, [x1], x2 ld1 {v20.16b, v21.16b}, [x1], x2 @@ -1327,7 +1372,7 @@ endfunc add x5, x5, x4, lsl #2 ld1r {v30.4s}, [x5] sub x1, x1, #1 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) .endm function ff_hevc_put_hevc_epel_h4_8_neon, export=1 @@ -2179,7 +2224,7 @@ DISABLE_I8MM function hevc_put_hevc_epel_hv4_8_end_neon load_epel_filterh x5, x4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ldr d16, [sp] ldr d17, [sp, x10] add sp, sp, x10, lsl #1 @@ -2198,7 +2243,7 @@ endfunc function hevc_put_hevc_epel_hv6_8_end_neon load_epel_filterh x5, x4 mov x5, #120 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ldr q16, [sp] ldr q17, [sp, x10] add sp, sp, x10, lsl #1 @@ -2218,7 +2263,7 @@ endfunc function hevc_put_hevc_epel_hv8_8_end_neon load_epel_filterh x5, x4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ldr q16, [sp] ldr q17, [sp, x10] add sp, sp, x10, lsl #1 @@ -2238,7 +2283,7 @@ endfunc function hevc_put_hevc_epel_hv12_8_end_neon load_epel_filterh x5, x4 mov x5, #112 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -2258,7 +2303,7 @@ endfunc function hevc_put_hevc_epel_hv16_8_end_neon load_epel_filterh x5, x4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -2278,7 +2323,7 @@ endfunc function hevc_put_hevc_epel_hv24_8_end_neon load_epel_filterh x5, x4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 @@ -2462,7 +2507,7 @@ epel_hv neon function hevc_put_hevc_epel_uni_hv4_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.4h}, [sp], x10 ld1 {v17.4h}, [sp], x10 ld1 {v18.4h}, [sp], x10 @@ -2481,7 +2526,7 @@ endfunc function hevc_put_hevc_epel_uni_hv6_8_end_neon load_epel_filterh x6, x5 sub x1, x1, #4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10 @@ -2501,7 +2546,7 @@ endfunc function hevc_put_hevc_epel_uni_hv8_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10 @@ -2521,7 +2566,7 @@ endfunc function hevc_put_hevc_epel_uni_hv12_8_end_neon load_epel_filterh x6, x5 sub x1, x1, #8 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -2543,7 +2588,7 @@ endfunc function hevc_put_hevc_epel_uni_hv16_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -2565,7 +2610,7 @@ endfunc function hevc_put_hevc_epel_uni_hv24_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 @@ -3223,7 +3268,7 @@ DISABLE_I8MM function hevc_put_hevc_epel_uni_w_hv4_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.4h}, [sp], x10 ld1 {v17.4h}, [sp], x10 ld1 {v18.4h}, [sp], x10 @@ -3273,7 +3318,7 @@ endfunc function hevc_put_hevc_epel_uni_w_hv6_8_end_neon load_epel_filterh x6, x5 sub x1, x1, #4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10 @@ -3326,7 +3371,7 @@ endfunc function hevc_put_hevc_epel_uni_w_hv8_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10 @@ -3376,7 +3421,7 @@ endfunc function hevc_put_hevc_epel_uni_w_hv12_8_end_neon load_epel_filterh x6, x5 sub x1, x1, #8 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -3437,7 +3482,7 @@ endfunc function hevc_put_hevc_epel_uni_w_hv16_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -3498,7 +3543,7 @@ endfunc function hevc_put_hevc_epel_uni_w_hv24_8_end_neon load_epel_filterh x6, x5 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 @@ -3795,7 +3840,7 @@ epel_uni_w_hv neon function hevc_put_hevc_epel_bi_hv4_8_end_neon load_epel_filterh x7, x6 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.4h}, [sp], x10 ld1 {v17.4h}, [sp], x10 ld1 {v18.4h}, [sp], x10 @@ -3816,7 +3861,7 @@ endfunc function hevc_put_hevc_epel_bi_hv6_8_end_neon load_epel_filterh x7, x6 sub x1, x1, #4 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10 @@ -3838,7 +3883,7 @@ endfunc function hevc_put_hevc_epel_bi_hv8_8_end_neon load_epel_filterh x7, x6 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10 @@ -3860,7 +3905,7 @@ endfunc function hevc_put_hevc_epel_bi_hv12_8_end_neon load_epel_filterh x7, x6 sub x1, x1, #8 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -3885,7 +3930,7 @@ endfunc function hevc_put_hevc_epel_bi_hv16_8_end_neon load_epel_filterh x7, x6 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10 @@ -3910,7 +3955,7 @@ endfunc function hevc_put_hevc_epel_bi_hv24_8_end_neon load_epel_filterh x7, x6 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 @@ -3939,7 +3984,7 @@ endfunc function hevc_put_hevc_epel_bi_hv32_8_end_neon load_epel_filterh x7, x6 - mov x10, #(MAX_PB_SIZE * 2) + mov x10, #(HEVC_MAX_PB_SIZE * 2) ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10 diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S index a05009c9d6..47b3948f8b 100644 --- a/libavcodec/aarch64/h26x/qpel_neon.S +++ b/libavcodec/aarch64/h26x/qpel_neon.S @@ -1250,6 +1250,10 @@ function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1 b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon) endfunc +function ff_vvc_put_pel_uni_pixels4_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_pixels4_8_neon) +endfunc + function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 1: ldr s0, [x2] @@ -1278,6 +1282,10 @@ function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_pixels8_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_pixels8_8_neon) +endfunc + function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1 1: ldr d0, [x2] @@ -1306,6 +1314,10 @@ function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_pixels16_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_pixels16_8_neon) +endfunc + function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1 1: ldr q0, [x2] @@ -1328,6 +1340,10 @@ function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_pixels32_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_pixels32_8_neon) +endfunc + function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1 1: ld1 {v0.16b, v1.16b}, [x2], x3 @@ -1346,6 +1362,10 @@ function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_pixels64_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_pixels64_8_neon) +endfunc + function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1 1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 @@ -1355,6 +1375,19 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_pixels128_8_neon, export=1 + sub x1, x1, #64 + sub x3, x3, #64 +1: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 + subs w4, w4, #1 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1 + b.ne 1b + ret +endfunc + function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1 load_qpel_filterb x6, x5 sub x2, x2, x3, lsl #1 @@ -1528,6 +1561,10 @@ function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1 b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon) endfunc +function ff_vvc_put_pel_uni_w_pixels4_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon) +endfunc + function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1 mov w10, #-6 sub w10, w10, w5 @@ -1598,6 +1635,10 @@ function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_w_pixels8_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon) +endfunc + function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1 mov w10, #-6 sub w10, w10, w5 @@ -1741,7 +1782,9 @@ function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1 ret endfunc - +function ff_vvc_put_pel_uni_w_pixels16_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon) +endfunc function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1 mov w10, #-6 @@ -1803,6 +1846,9 @@ function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_w_pixels32_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon) +endfunc function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1 mov w10, #-6 @@ -1839,6 +1885,39 @@ function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1 ret endfunc +function ff_vvc_put_pel_uni_w_pixels64_8_neon, export=1 + b X(ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon) +endfunc + +function ff_vvc_put_pel_uni_w_pixels128_8_neon, export=1 + mov w10, #-6 + sub w10, w10, w5 + dup v30.8h, w6 + dup v31.4s, w10 + dup v29.4s, w7 + sub x1, x1, #64 + sub x3, x3, #64 +1: + mov x11, x2 + mov x12, x0 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 + PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19 + PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23 + PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19 + PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 + subs w4, w4, #1 + PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19 + PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23 + PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19 + PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + b.ne 1b + ret +endfunc + .macro QPEL_UNI_W_V_HEADER ldur x12, [sp, #8] // my sub x2, x2, x3, lsl #1 diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile index a5ad24dfc5..a1c1f03e27 100644 --- a/libavcodec/aarch64/vvc/Makefile +++ b/libavcodec/aarch64/vvc/Makefile @@ -3,5 +3,6 @@ clean:: OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/dsp_init.o NEON-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/alf.o \ + aarch64/h26x/epel_neon.o \ aarch64/h26x/qpel_neon.o \ aarch64/h26x/sao_neon.o diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c index ea6245d9a3..457be8c725 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -46,6 +46,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) return; if (bd == 8) { + c->inter.put[0][1][0][0] = ff_vvc_put_pel_pixels4_8_neon; + c->inter.put[0][2][0][0] = ff_vvc_put_pel_pixels8_8_neon; + c->inter.put[0][3][0][0] = ff_vvc_put_pel_pixels16_8_neon; + c->inter.put[0][4][0][0] = ff_vvc_put_pel_pixels32_8_neon; + c->inter.put[0][5][0][0] = ff_vvc_put_pel_pixels64_8_neon; + c->inter.put[0][6][0][0] = ff_vvc_put_pel_pixels128_8_neon; + c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon; c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon; c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon; @@ -53,6 +60,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.put[0][5][0][1] = c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon; + c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon; + c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon; + c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon; + c->inter.put_uni[0][4][0][0] = ff_vvc_put_pel_uni_pixels32_8_neon; + c->inter.put_uni[0][5][0][0] = ff_vvc_put_pel_uni_pixels64_8_neon; + c->inter.put_uni[0][6][0][0] = ff_vvc_put_pel_uni_pixels128_8_neon; + c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon; c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon; c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon; @@ -60,6 +74,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.put_uni[0][5][0][1] = c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon; + c->inter.put_uni_w[0][1][0][0] = ff_vvc_put_pel_uni_w_pixels4_8_neon; + c->inter.put_uni_w[0][2][0][0] = ff_vvc_put_pel_uni_w_pixels8_8_neon; + c->inter.put_uni_w[0][3][0][0] = ff_vvc_put_pel_uni_w_pixels16_8_neon; + c->inter.put_uni_w[0][4][0][0] = ff_vvc_put_pel_uni_w_pixels32_8_neon; + c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon; + c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon; + for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++) c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon; c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;