aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w

put_luma_pixels_8_4x4_c:                                 0.2 ( 1.00x)
put_luma_pixels_8_4x4_neon:                              0.2 ( 1.00x)
put_luma_pixels_8_8x8_c:                                 0.7 ( 1.00x)
put_luma_pixels_8_8x8_neon:                              0.2 ( 3.22x)
put_luma_pixels_8_16x16_c:                               2.2 ( 1.00x)
put_luma_pixels_8_16x16_neon:                            0.2 ( 9.89x)
put_luma_pixels_8_32x32_c:                               8.2 ( 1.00x)
put_luma_pixels_8_32x32_neon:                            1.2 ( 6.71x)
put_luma_pixels_8_64x64_c:                              33.7 ( 1.00x)
put_luma_pixels_8_64x64_neon:                            2.5 (13.63x)
put_luma_pixels_8_128x128_c:                           145.5 ( 1.00x)
put_luma_pixels_8_128x128_neon:                         10.2 (14.23x)
put_uni_pixels_luma_8_4x4_c:                             0.5 ( 1.00x)
put_uni_pixels_luma_8_4x4_neon:                          0.0 ( 0.00x)
put_uni_pixels_luma_8_8x8_c:                             0.5 ( 1.00x)
put_uni_pixels_luma_8_8x8_neon:                          0.2 ( 2.11x)
put_uni_pixels_luma_8_16x16_c:                           1.2 ( 1.00x)
put_uni_pixels_luma_8_16x16_neon:                        0.2 ( 5.44x)
put_uni_pixels_luma_8_32x32_c:                           3.0 ( 1.00x)
put_uni_pixels_luma_8_32x32_neon:                        0.5 ( 6.26x)
put_uni_pixels_luma_8_64x64_c:                           3.0 ( 1.00x)
put_uni_pixels_luma_8_64x64_neon:                        1.7 ( 1.72x)
put_uni_pixels_luma_8_128x128_c:                         6.5 ( 1.00x)
put_uni_pixels_luma_8_128x128_neon:                      6.5 ( 1.00x)
release/7.1
Zhao Zhili 4 months ago committed by Nuo Mi
parent 20f2bf5530
commit 25448d1716
Changed files (lines changed per file):
  1. libavcodec/aarch64/h26x/dsp.h: 22
  2. libavcodec/aarch64/h26x/epel_neon.S: 189
  3. libavcodec/aarch64/h26x/qpel_neon.S: 81
  4. libavcodec/aarch64/vvc/Makefile: 1
  5. libavcodec/aarch64/vvc/dsp_init.c: 21

@ -248,4 +248,26 @@ NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t *_src, ptrdiff_t _src
NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),) ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width),)
/*
 * Declare six 8-bit NEON prototypes in one go:
 *   ff_vvc_put_<fn>{4,8,16,32,64,128}_8_neon<ext>
 * All six share the parenthesised parameter list passed as `args`.
 * The name is #undef'd first so this definition replaces any earlier one.
 * The last declaration carries no trailing semicolon; each use of the macro
 * supplies its own.
 */
#undef NEON8_FNPROTO_PARTIAL_6
#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
void ff_vvc_put_##fn##4_8_neon##ext args; \
void ff_vvc_put_##fn##8_8_neon##ext args; \
void ff_vvc_put_##fn##16_8_neon##ext args; \
void ff_vvc_put_##fn##32_8_neon##ext args; \
void ff_vvc_put_##fn##64_8_neon##ext args; \
void ff_vvc_put_##fn##128_8_neon##ext args
/*
 * ff_vvc_put_pel_pixels{4..128}_8_neon: int16_t destination, no destination
 * stride parameter. The empty third macro argument means the generated names
 * carry no extension suffix.
 */
NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride, int height,
const int8_t *hf, const int8_t *vf, int width),);
/*
 * ff_vvc_put_pel_uni_pixels{4..128}_8_neon: uint8_t destination with an
 * explicit destination stride.
 */
NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, int height,
const int8_t *hf, const int8_t *vf, int width),);
/*
 * ff_vvc_put_pel_uni_w_pixels{4..128}_8_neon: same as the uni variant plus
 * denom, wx and ox.
 * NOTE(review): presumably the weighted-prediction shift, weight and offset;
 * confirm against the C reference implementation.
 */
NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
const int8_t *hf, const int8_t *vf, int width),);
#endif #endif

@ -19,7 +19,8 @@
*/ */
#include "libavutil/aarch64/asm.S" #include "libavutil/aarch64/asm.S"
#define MAX_PB_SIZE 64 #define HEVC_MAX_PB_SIZE 64
// Size constant used by the ff_vvc_* entry points in this file to form the
// value they load into x7 (e.g. VVC_MAX_PB_SIZE * 2).
#define VVC_MAX_PB_SIZE 128
const epel_filters, align=4 const epel_filters, align=4
.byte 0, 0, 0, 0 .byte 0, 0, 0, 0
@ -131,8 +132,13 @@ endconst
b.ne 1b b.ne 1b
.endm .endm
// VVC entry point for the 4-pixel-wide put_pel_pixels case.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and branches forward to local label 1,
// which is the loop of ff_hevc_put_hevc_pel_pixels4_8_neon placed directly
// after this function. That skips the HEVC function's own `mov x7`.
// This function must therefore stay immediately in front of the HEVC one.
// NOTE(review): x7 is presumably the dst row advance in bytes used by the
// shared loop; the store that consumes it is outside this hunk, so confirm.
function ff_vvc_put_pel_pixels4_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2) mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.s}[0], [x1], x2 1: ld1 {v0.s}[0], [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
subs w3, w3, #1 subs w3, w3, #1
@ -142,7 +148,7 @@ function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2 - 8) mov x7, #(HEVC_MAX_PB_SIZE * 2 - 8)
1: ld1 {v0.8b}, [x1], x2 1: ld1 {v0.8b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
st1 {v4.d}[0], [x0], #8 st1 {v4.d}[0], [x0], #8
@ -152,8 +158,13 @@ function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
ret ret
endfunc endfunc
// VVC entry point for the 8-pixel-wide put_pel_pixels case.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and branches forward to local label 1,
// which is the loop of ff_hevc_put_hevc_pel_pixels8_8_neon placed directly
// after this function. That skips the HEVC function's own `mov x7`.
// This function must therefore stay immediately in front of the HEVC one.
// NOTE(review): x7 is presumably the dst row advance in bytes used by the
// shared loop; the store that consumes it is outside this hunk, so confirm.
function ff_vvc_put_pel_pixels8_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2) mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b}, [x1], x2 1: ld1 {v0.8b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
subs w3, w3, #1 subs w3, w3, #1
@ -163,7 +174,7 @@ function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2 - 16) mov x7, #(HEVC_MAX_PB_SIZE * 2 - 16)
1: ld1 {v0.8b, v1.8b}, [x1], x2 1: ld1 {v0.8b, v1.8b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
st1 {v4.8h}, [x0], #16 st1 {v4.8h}, [x0], #16
@ -174,8 +185,13 @@ function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
ret ret
endfunc endfunc
// VVC entry point for the 16-pixel-wide put_pel_pixels case.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and branches forward to local label 1,
// which is the loop of ff_hevc_put_hevc_pel_pixels16_8_neon placed directly
// after this function. That skips the HEVC function's own `mov x7`.
// This function must therefore stay immediately in front of the HEVC one.
// NOTE(review): x7 is presumably the dst row advance in bytes used by the
// shared loop; the store that consumes it is outside this hunk, so confirm.
function ff_vvc_put_pel_pixels16_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2) mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b, v1.8b}, [x1], x2 1: ld1 {v0.8b, v1.8b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
ushll v5.8h, v1.8b, #6 ushll v5.8h, v1.8b, #6
@ -186,7 +202,7 @@ function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2) mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b-v2.8b}, [x1], x2 1: ld1 {v0.8b-v2.8b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
ushll v5.8h, v1.8b, #6 ushll v5.8h, v1.8b, #6
@ -197,8 +213,13 @@ function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
ret ret
endfunc endfunc
// VVC entry point for the 32-pixel-wide put_pel_pixels case.
// Loads x7 with VVC_MAX_PB_SIZE * 2 and branches forward to local label 1,
// which is the loop of ff_hevc_put_hevc_pel_pixels32_8_neon placed directly
// after this function. That skips the HEVC function's own `mov x7`.
// This function must therefore stay immediately in front of the HEVC one.
// NOTE(review): x7 is presumably the dst row advance in bytes used by the
// shared loop; the store that consumes it is outside this hunk, so confirm.
function ff_vvc_put_pel_pixels32_8_neon, export=1
mov x7, #(VVC_MAX_PB_SIZE * 2)
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
mov x7, #(MAX_PB_SIZE * 2) mov x7, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b-v3.8b}, [x1], x2 1: ld1 {v0.8b-v3.8b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
ushll v5.8h, v1.8b, #6 ushll v5.8h, v1.8b, #6
@ -211,7 +232,7 @@ function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1 function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
mov x7, #(MAX_PB_SIZE) mov x7, #(HEVC_MAX_PB_SIZE)
1: ld1 {v0.16b-v2.16b}, [x1], x2 1: ld1 {v0.16b-v2.16b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
ushll2 v5.8h, v0.16b, #6 ushll2 v5.8h, v0.16b, #6
@ -226,26 +247,50 @@ function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
ret ret
endfunc endfunc
function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1 .macro put_pel_pixels64_8_neon
1: ld1 {v0.16b-v3.16b}, [x1], x2
ushll v4.8h, v0.8b, #6 ushll v4.8h, v0.8b, #6
ushll2 v5.8h, v0.16b, #6 ushll2 v5.8h, v0.16b, #6
ushll v6.8h, v1.8b, #6 ushll v6.8h, v1.8b, #6
ushll2 v7.8h, v1.16b, #6 ushll2 v7.8h, v1.16b, #6
st1 {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE) st1 {v4.8h-v7.8h}, [x0], #64
ushll v16.8h, v2.8b, #6 ushll v16.8h, v2.8b, #6
ushll2 v17.8h, v2.16b, #6 ushll2 v17.8h, v2.16b, #6
ushll v18.8h, v3.8b, #6 ushll v18.8h, v3.8b, #6
ushll2 v19.8h, v3.16b, #6 ushll2 v19.8h, v3.16b, #6
st1 {v16.8h-v19.8h}, [x0], x7
.endm
// VVC entry point for the 64-pixel-wide put_pel_pixels case.
// The shared put_pel_pixels64_8_neon macro stores 64 bytes with a fixed #64
// post-increment and another 64 bytes with an x7 post-increment. With
// x7 = 2 * VVC_MAX_PB_SIZE - 64, x0 advances by 2 * VVC_MAX_PB_SIZE bytes
// per row.
// The branch targets local label 1 in ff_hevc_put_hevc_pel_pixels64_8_neon,
// which follows directly, skipping that function's own `mov x7`.
function ff_vvc_put_pel_pixels64_8_neon, export=1
mov x7, #(2 * VVC_MAX_PB_SIZE - 64)
b 1f
endfunc
function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
mov x7, #(HEVC_MAX_PB_SIZE)
1:
ld1 {v0.16b-v3.16b}, [x1], x2
subs w3, w3, #1 subs w3, w3, #1
st1 {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE) put_pel_pixels64_8_neon
b.ne 1b b.ne 1b
ret ret
endfunc endfunc
// VVC put_pel_pixels for 128-pixel-wide rows, 8-bit source.
// Each row is handled as two 64-byte halves, each expanded and stored by the
// put_pel_pixels64_8_neon macro.
// Register use visible in this function:
//   x0 = dst pointer (advanced by the macro's two post-incrementing stores)
//   x1 = start of the current source row, x2 = source stride
//   w3 = remaining row count
//   x6 = scratch pointer walking the current source row
//   x7 = post-increment for the macro's second store
// NOTE(review): per the 7-argument C prototype, x6 arrives holding `width`;
// it is overwritten here, which looks intentional since width is fixed at 128.
function ff_vvc_put_pel_pixels128_8_neon, export=1
mov x7, #64 // each macro expansion advances x0 by 64 + x7 = 128 bytes
1:
mov x6, x1 // walk the row through x6 so x1 can move to the next row early
ld1 {v0.16b-v3.16b}, [x6], #64 // first 64 source bytes of the row
add x1, x1, x2 // x1 -> next source row
subs w3, w3, #1 // flags are consumed by b.ne below; the macro body is ushll/st1 only
put_pel_pixels64_8_neon
ld1 {v0.16b-v3.16b}, [x6], #64 // second 64 source bytes of the row
put_pel_pixels64_8_neon
b.ne 1b // loop while rows remain
ret
endfunc
function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.s}[0], [x2], x3 // src 1: ld1 {v0.s}[0], [x2], x3 // src
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
ld1 {v20.4h}, [x4], x10 // src2 ld1 {v20.4h}, [x4], x10 // src2
@ -258,7 +303,7 @@ function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
sub x1, x1, #4 sub x1, x1, #4
1: ld1 {v0.8b}, [x2], x3 1: ld1 {v0.8b}, [x2], x3
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
@ -273,7 +318,7 @@ function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b}, [x2], x3 // src 1: ld1 {v0.8b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
ld1 {v20.8h}, [x4], x10 // src2 ld1 {v20.8h}, [x4], x10 // src2
@ -286,7 +331,7 @@ function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
sub x1, x1, #8 sub x1, x1, #8
1: ld1 {v0.16b}, [x2], x3 1: ld1 {v0.16b}, [x2], x3
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
@ -304,7 +349,7 @@ function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.16b}, [x2], x3 // src 1: ld1 {v0.16b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6 ushll2 v17.8h, v0.16b, #6
@ -320,7 +365,7 @@ function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.8b-v2.8b}, [x2], x3 // src 1: ld1 {v0.8b-v2.8b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
ushll v17.8h, v1.8b, #6 ushll v17.8h, v1.8b, #6
@ -339,7 +384,7 @@ function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v0.16b-v1.16b}, [x2], x3 // src 1: ld1 {v0.16b-v1.16b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6 ushll2 v17.8h, v0.16b, #6
@ -361,7 +406,7 @@ function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
endfunc endfunc
function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1 function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
mov x10, #(MAX_PB_SIZE) mov x10, #(HEVC_MAX_PB_SIZE)
1: ld1 {v0.16b-v2.16b}, [x2], x3 // src 1: ld1 {v0.16b-v2.16b}, [x2], x3 // src
ushll v16.8h, v0.8b, #6 ushll v16.8h, v0.8b, #6
ushll2 v17.8h, v0.16b, #6 ushll2 v17.8h, v0.16b, #6
@ -369,7 +414,7 @@ function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
ushll2 v19.8h, v1.16b, #6 ushll2 v19.8h, v1.16b, #6
ushll v20.8h, v2.8b, #6 ushll v20.8h, v2.8b, #6
ushll2 v21.8h, v2.16b, #6 ushll2 v21.8h, v2.16b, #6
ld1 {v24.8h-v27.8h}, [x4], #(MAX_PB_SIZE) // src2 ld1 {v24.8h-v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2
sqadd v16.8h, v16.8h, v24.8h sqadd v16.8h, v16.8h, v24.8h
sqadd v17.8h, v17.8h, v25.8h sqadd v17.8h, v17.8h, v25.8h
sqadd v18.8h, v18.8h, v26.8h sqadd v18.8h, v18.8h, v26.8h
@ -399,12 +444,12 @@ function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
ushll2 v21.8h, v2.16b, #6 ushll2 v21.8h, v2.16b, #6
ushll v22.8h, v3.8b, #6 ushll v22.8h, v3.8b, #6
ushll2 v23.8h, v3.16b, #6 ushll2 v23.8h, v3.16b, #6
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) // src2 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE) // src2
sqadd v16.8h, v16.8h, v24.8h sqadd v16.8h, v16.8h, v24.8h
sqadd v17.8h, v17.8h, v25.8h sqadd v17.8h, v17.8h, v25.8h
sqadd v18.8h, v18.8h, v26.8h sqadd v18.8h, v18.8h, v26.8h
sqadd v19.8h, v19.8h, v27.8h sqadd v19.8h, v19.8h, v27.8h
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE) ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(HEVC_MAX_PB_SIZE)
sqadd v20.8h, v20.8h, v24.8h sqadd v20.8h, v20.8h, v24.8h
sqadd v21.8h, v21.8h, v25.8h sqadd v21.8h, v21.8h, v25.8h
sqadd v22.8h, v22.8h, v26.8h sqadd v22.8h, v22.8h, v26.8h
@ -427,7 +472,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub x2, x2, #1 sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v4.8b}, [x2], x3 1: ld1 {v4.8b}, [x2], x3
ext v5.8b, v4.8b, v4.8b, #1 ext v5.8b, v4.8b, v4.8b, #1
ext v6.8b, v4.8b, v4.8b, #2 ext v6.8b, v4.8b, v4.8b, #2
@ -446,7 +491,7 @@ function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub w1, w1, #4 sub w1, w1, #4
sub x2, x2, #1 sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b}, [x2], x3 1: ld1 {v24.16b}, [x2], x3
ext v26.16b, v24.16b, v24.16b, #1 ext v26.16b, v24.16b, v24.16b, #1
ext v27.16b, v24.16b, v24.16b, #2 ext v27.16b, v24.16b, v24.16b, #2
@ -465,7 +510,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub x2, x2, #1 sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b}, [x2], x3 1: ld1 {v24.16b}, [x2], x3
ext v26.16b, v24.16b, v24.16b, #1 ext v26.16b, v24.16b, v24.16b, #1
ext v27.16b, v24.16b, v24.16b, #2 ext v27.16b, v24.16b, v24.16b, #2
@ -484,7 +529,7 @@ function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub x1, x1, #8 sub x1, x1, #8
sub x2, x2, #1 sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b}, [x2], x3 1: ld1 {v24.16b}, [x2], x3
ext v26.16b, v24.16b, v24.16b, #1 ext v26.16b, v24.16b, v24.16b, #1
ext v27.16b, v24.16b, v24.16b, #2 ext v27.16b, v24.16b, v24.16b, #2
@ -506,7 +551,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub x2, x2, #1 sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ldr q24, [x2] 1: ldr q24, [x2]
ldr s25, [x2, #16] ldr s25, [x2, #16]
add x2, x2, x3 add x2, x2, x3
@ -529,7 +574,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub x2, x2, #1 sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ld1 {v24.16b, v25.16b}, [x2], x3 1: ld1 {v24.16b, v25.16b}, [x2], x3
ext v26.16b, v24.16b, v25.16b, #1 ext v26.16b, v24.16b, v25.16b, #1
ext v27.16b, v24.16b, v25.16b, #2 ext v27.16b, v24.16b, v25.16b, #2
@ -556,7 +601,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub x2, x2, #1 sub x2, x2, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
1: ldp q24, q25, [x2] 1: ldp q24, q25, [x2]
ldr s26, [x2, #32] ldr s26, [x2, #32]
add x2, x2, x3 add x2, x2, x3
@ -589,7 +634,7 @@ function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1
load_epel_filterb x6, x7 load_epel_filterb x6, x7
sub x2, x2, #1 sub x2, x2, #1
mov x7, #24 mov x7, #24
mov x10, #(MAX_PB_SIZE * 2 - 48) mov x10, #(HEVC_MAX_PB_SIZE * 2 - 48)
1: ld1 {v24.16b, v25.16b, v26.16b}, [x2] 1: ld1 {v24.16b, v25.16b, v26.16b}, [x2]
ldr s27, [x2, #48] ldr s27, [x2, #48]
add x2, x2, x3 add x2, x2, x3
@ -683,7 +728,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1
load_epel_filterb x7, x6 load_epel_filterb x7, x6
sub x2, x2, x3 sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.s}[0], [x2], x3 ld1 {v16.s}[0], [x2], x3
ld1 {v17.s}[0], [x2], x3 ld1 {v17.s}[0], [x2], x3
ld1 {v18.s}[0], [x2], x3 ld1 {v18.s}[0], [x2], x3
@ -705,7 +750,7 @@ function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1
load_epel_filterb x7, x6 load_epel_filterb x7, x6
sub x2, x2, x3 sub x2, x2, x3
sub x1, x1, #4 sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b}, [x2], x3 ld1 {v16.8b}, [x2], x3
ld1 {v17.8b}, [x2], x3 ld1 {v17.8b}, [x2], x3
ld1 {v18.8b}, [x2], x3 ld1 {v18.8b}, [x2], x3
@ -727,7 +772,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1
load_epel_filterb x7, x6 load_epel_filterb x7, x6
sub x2, x2, x3 sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b}, [x2], x3 ld1 {v16.8b}, [x2], x3
ld1 {v17.8b}, [x2], x3 ld1 {v17.8b}, [x2], x3
ld1 {v18.8b}, [x2], x3 ld1 {v18.8b}, [x2], x3
@ -749,7 +794,7 @@ function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1
load_epel_filterb x7, x6 load_epel_filterb x7, x6
sub x1, x1, #8 sub x1, x1, #8
sub x2, x2, x3 sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b}, [x2], x3 ld1 {v16.16b}, [x2], x3
ld1 {v17.16b}, [x2], x3 ld1 {v17.16b}, [x2], x3
ld1 {v18.16b}, [x2], x3 ld1 {v18.16b}, [x2], x3
@ -774,7 +819,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1
load_epel_filterb x7, x6 load_epel_filterb x7, x6
sub x2, x2, x3 sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b}, [x2], x3 ld1 {v16.16b}, [x2], x3
ld1 {v17.16b}, [x2], x3 ld1 {v17.16b}, [x2], x3
ld1 {v18.16b}, [x2], x3 ld1 {v18.16b}, [x2], x3
@ -798,7 +843,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1
load_epel_filterb x7, x6 load_epel_filterb x7, x6
sub x2, x2, x3 sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b, v17.8b, v18.8b}, [x2], x3 ld1 {v16.8b, v17.8b, v18.8b}, [x2], x3
ld1 {v19.8b, v20.8b, v21.8b}, [x2], x3 ld1 {v19.8b, v20.8b, v21.8b}, [x2], x3
ld1 {v22.8b, v23.8b, v24.8b}, [x2], x3 ld1 {v22.8b, v23.8b, v24.8b}, [x2], x3
@ -825,7 +870,7 @@ endfunc
function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1 function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
load_epel_filterb x7, x6 load_epel_filterb x7, x6
sub x2, x2, x3 sub x2, x2, x3
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b, v17.16b}, [x2], x3 ld1 {v16.16b, v17.16b}, [x2], x3
ld1 {v18.16b, v19.16b}, [x2], x3 ld1 {v18.16b, v19.16b}, [x2], x3
ld1 {v20.16b, v21.16b}, [x2], x3 ld1 {v20.16b, v21.16b}, [x2], x3
@ -895,7 +940,7 @@ endfunc
function ff_hevc_put_hevc_epel_v4_8_neon, export=1 function ff_hevc_put_hevc_epel_v4_8_neon, export=1
load_epel_filterb x5, x4 load_epel_filterb x5, x4
sub x1, x1, x2 sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr s16, [x1] ldr s16, [x1]
ldr s17, [x1, x2] ldr s17, [x1, x2]
add x1, x1, x2, lsl #1 add x1, x1, x2, lsl #1
@ -915,7 +960,7 @@ endfunc
function ff_hevc_put_hevc_epel_v6_8_neon, export=1 function ff_hevc_put_hevc_epel_v6_8_neon, export=1
load_epel_filterb x5, x4 load_epel_filterb x5, x4
sub x1, x1, x2 sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2 - 8) mov x10, #(HEVC_MAX_PB_SIZE * 2 - 8)
ldr d16, [x1] ldr d16, [x1]
ldr d17, [x1, x2] ldr d17, [x1, x2]
add x1, x1, x2, lsl #1 add x1, x1, x2, lsl #1
@ -936,7 +981,7 @@ endfunc
function ff_hevc_put_hevc_epel_v8_8_neon, export=1 function ff_hevc_put_hevc_epel_v8_8_neon, export=1
load_epel_filterb x5, x4 load_epel_filterb x5, x4
sub x1, x1, x2 sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr d16, [x1] ldr d16, [x1]
ldr d17, [x1, x2] ldr d17, [x1, x2]
add x1, x1, x2, lsl #1 add x1, x1, x2, lsl #1
@ -956,7 +1001,7 @@ endfunc
function ff_hevc_put_hevc_epel_v12_8_neon, export=1 function ff_hevc_put_hevc_epel_v12_8_neon, export=1
load_epel_filterb x5, x4 load_epel_filterb x5, x4
sub x1, x1, x2 sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [x1] ldr q16, [x1]
ldr q17, [x1, x2] ldr q17, [x1, x2]
add x1, x1, x2, lsl #1 add x1, x1, x2, lsl #1
@ -980,7 +1025,7 @@ endfunc
function ff_hevc_put_hevc_epel_v16_8_neon, export=1 function ff_hevc_put_hevc_epel_v16_8_neon, export=1
load_epel_filterb x5, x4 load_epel_filterb x5, x4
sub x1, x1, x2 sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [x1] ldr q16, [x1]
ldr q17, [x1, x2] ldr q17, [x1, x2]
add x1, x1, x2, lsl #1 add x1, x1, x2, lsl #1
@ -1002,7 +1047,7 @@ endfunc
function ff_hevc_put_hevc_epel_v24_8_neon, export=1 function ff_hevc_put_hevc_epel_v24_8_neon, export=1
load_epel_filterb x5, x4 load_epel_filterb x5, x4
sub x1, x1, x2 sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2 ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2 ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2 ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
@ -1025,7 +1070,7 @@ endfunc
function ff_hevc_put_hevc_epel_v32_8_neon, export=1 function ff_hevc_put_hevc_epel_v32_8_neon, export=1
load_epel_filterb x5, x4 load_epel_filterb x5, x4
sub x1, x1, x2 sub x1, x1, x2
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.16b, v17.16b}, [x1], x2 ld1 {v16.16b, v17.16b}, [x1], x2
ld1 {v18.16b, v19.16b}, [x1], x2 ld1 {v18.16b, v19.16b}, [x1], x2
ld1 {v20.16b, v21.16b}, [x1], x2 ld1 {v20.16b, v21.16b}, [x1], x2
@ -1327,7 +1372,7 @@ endfunc
add x5, x5, x4, lsl #2 add x5, x5, x4, lsl #2
ld1r {v30.4s}, [x5] ld1r {v30.4s}, [x5]
sub x1, x1, #1 sub x1, x1, #1
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
.endm .endm
function ff_hevc_put_hevc_epel_h4_8_neon, export=1 function ff_hevc_put_hevc_epel_h4_8_neon, export=1
@ -2179,7 +2224,7 @@ DISABLE_I8MM
function hevc_put_hevc_epel_hv4_8_end_neon function hevc_put_hevc_epel_hv4_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr d16, [sp] ldr d16, [sp]
ldr d17, [sp, x10] ldr d17, [sp, x10]
add sp, sp, x10, lsl #1 add sp, sp, x10, lsl #1
@ -2198,7 +2243,7 @@ endfunc
function hevc_put_hevc_epel_hv6_8_end_neon function hevc_put_hevc_epel_hv6_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x5, #120 mov x5, #120
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [sp] ldr q16, [sp]
ldr q17, [sp, x10] ldr q17, [sp, x10]
add sp, sp, x10, lsl #1 add sp, sp, x10, lsl #1
@ -2218,7 +2263,7 @@ endfunc
function hevc_put_hevc_epel_hv8_8_end_neon function hevc_put_hevc_epel_hv8_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ldr q16, [sp] ldr q16, [sp]
ldr q17, [sp, x10] ldr q17, [sp, x10]
add sp, sp, x10, lsl #1 add sp, sp, x10, lsl #1
@ -2238,7 +2283,7 @@ endfunc
function hevc_put_hevc_epel_hv12_8_end_neon function hevc_put_hevc_epel_hv12_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x5, #112 mov x5, #112
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -2258,7 +2303,7 @@ endfunc
function hevc_put_hevc_epel_hv16_8_end_neon function hevc_put_hevc_epel_hv16_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -2278,7 +2323,7 @@ endfunc
function hevc_put_hevc_epel_hv24_8_end_neon function hevc_put_hevc_epel_hv24_8_end_neon
load_epel_filterh x5, x4 load_epel_filterh x5, x4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -2462,7 +2507,7 @@ epel_hv neon
function hevc_put_hevc_epel_uni_hv4_8_end_neon function hevc_put_hevc_epel_uni_hv4_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.4h}, [sp], x10 ld1 {v16.4h}, [sp], x10
ld1 {v17.4h}, [sp], x10 ld1 {v17.4h}, [sp], x10
ld1 {v18.4h}, [sp], x10 ld1 {v18.4h}, [sp], x10
@ -2481,7 +2526,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv6_8_end_neon function hevc_put_hevc_epel_uni_hv6_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
sub x1, x1, #4 sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10 ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10
@ -2501,7 +2546,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv8_8_end_neon function hevc_put_hevc_epel_uni_hv8_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10 ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10
@ -2521,7 +2566,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv12_8_end_neon function hevc_put_hevc_epel_uni_hv12_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
sub x1, x1, #8 sub x1, x1, #8
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -2543,7 +2588,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv16_8_end_neon function hevc_put_hevc_epel_uni_hv16_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -2565,7 +2610,7 @@ endfunc
function hevc_put_hevc_epel_uni_hv24_8_end_neon function hevc_put_hevc_epel_uni_hv24_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -3223,7 +3268,7 @@ DISABLE_I8MM
function hevc_put_hevc_epel_uni_w_hv4_8_end_neon function hevc_put_hevc_epel_uni_w_hv4_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.4h}, [sp], x10 ld1 {v16.4h}, [sp], x10
ld1 {v17.4h}, [sp], x10 ld1 {v17.4h}, [sp], x10
ld1 {v18.4h}, [sp], x10 ld1 {v18.4h}, [sp], x10
@ -3273,7 +3318,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv6_8_end_neon function hevc_put_hevc_epel_uni_w_hv6_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
sub x1, x1, #4 sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10 ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10
@ -3326,7 +3371,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv8_8_end_neon function hevc_put_hevc_epel_uni_w_hv8_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10 ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10
@ -3376,7 +3421,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv12_8_end_neon function hevc_put_hevc_epel_uni_w_hv12_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
sub x1, x1, #8 sub x1, x1, #8
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -3437,7 +3482,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv16_8_end_neon function hevc_put_hevc_epel_uni_w_hv16_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -3498,7 +3543,7 @@ endfunc
function hevc_put_hevc_epel_uni_w_hv24_8_end_neon function hevc_put_hevc_epel_uni_w_hv24_8_end_neon
load_epel_filterh x6, x5 load_epel_filterh x6, x5
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -3795,7 +3840,7 @@ epel_uni_w_hv neon
function hevc_put_hevc_epel_bi_hv4_8_end_neon function hevc_put_hevc_epel_bi_hv4_8_end_neon
load_epel_filterh x7, x6 load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.4h}, [sp], x10 ld1 {v16.4h}, [sp], x10
ld1 {v17.4h}, [sp], x10 ld1 {v17.4h}, [sp], x10
ld1 {v18.4h}, [sp], x10 ld1 {v18.4h}, [sp], x10
@ -3816,7 +3861,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv6_8_end_neon function hevc_put_hevc_epel_bi_hv6_8_end_neon
load_epel_filterh x7, x6 load_epel_filterh x7, x6
sub x1, x1, #4 sub x1, x1, #4
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10 ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10
@ -3838,7 +3883,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv8_8_end_neon function hevc_put_hevc_epel_bi_hv8_8_end_neon
load_epel_filterh x7, x6 load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h}, [sp], x10 ld1 {v16.8h}, [sp], x10
ld1 {v17.8h}, [sp], x10 ld1 {v17.8h}, [sp], x10
ld1 {v18.8h}, [sp], x10 ld1 {v18.8h}, [sp], x10
@ -3860,7 +3905,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv12_8_end_neon function hevc_put_hevc_epel_bi_hv12_8_end_neon
load_epel_filterh x7, x6 load_epel_filterh x7, x6
sub x1, x1, #8 sub x1, x1, #8
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -3885,7 +3930,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv16_8_end_neon function hevc_put_hevc_epel_bi_hv16_8_end_neon
load_epel_filterh x7, x6 load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h}, [sp], x10 ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10 ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10 ld1 {v20.8h, v21.8h}, [sp], x10
@ -3910,7 +3955,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv24_8_end_neon function hevc_put_hevc_epel_bi_hv24_8_end_neon
load_epel_filterh x7, x6 load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10 ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10 ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10 ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
@ -3939,7 +3984,7 @@ endfunc
function hevc_put_hevc_epel_bi_hv32_8_end_neon function hevc_put_hevc_epel_bi_hv32_8_end_neon
load_epel_filterh x7, x6 load_epel_filterh x7, x6
mov x10, #(MAX_PB_SIZE * 2) mov x10, #(HEVC_MAX_PB_SIZE * 2)
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10

@ -1250,6 +1250,10 @@ function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon) b X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
endfunc endfunc
// VVC-named entry point for the 4-wide uni pixel copy.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_pixels4_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels4_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
1: 1:
ldr s0, [x2] ldr s0, [x2]
@ -1278,6 +1282,10 @@ function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 8-wide uni pixel copy.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_pixels8_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels8_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
1: 1:
ldr d0, [x2] ldr d0, [x2]
@ -1306,6 +1314,10 @@ function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 16-wide uni pixel copy.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_pixels16_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels16_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
1: 1:
ldr q0, [x2] ldr q0, [x2]
@ -1328,6 +1340,10 @@ function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 32-wide uni pixel copy.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_pixels32_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels32_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
1: 1:
ld1 {v0.16b, v1.16b}, [x2], x3 ld1 {v0.16b, v1.16b}, [x2], x3
@ -1346,6 +1362,10 @@ function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 64-wide uni pixel copy.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_pixels64_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_pixels64_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
1: 1:
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
@ -1355,6 +1375,19 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
ret ret
endfunc endfunc
// Copies w4 rows of 128 bytes each from the buffer at x2 to the buffer at x0.
// Register use, as read from the code below:
//   x0  destination pointer (st1 target), advanced row by row
//   x1  destination row step in bytes
//   x2  source pointer (ld1 source), advanced row by row
//   x3  source row step in bytes
//   w4  row count; the loop ends when it reaches zero, so it must be
//       non-zero on entry (a zero count would wrap around)
// Modifies x0-x4, v0-v7 and the condition flags.
function ff_vvc_put_pel_uni_pixels128_8_neon, export=1
// Each row is moved as two 64-byte halves. The first half post-increments
// its pointer by 64, so 64 is taken off both row steps up front; the second
// post-increment then lands exactly on the start of the next row.
sub x1, x1, #64
sub x3, x3, #64
1:
// First 64 bytes of the source row.
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
// Count the row here; the loads and stores below leave the flags alone, so
// the result is still valid when b.ne tests it.
subs w4, w4, #1
// Second 64 bytes; x2 moves on to the next source row.
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
// Second half of the destination row; x0 moves on to the next row.
st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
b.ne 1b
ret
endfunc
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1 function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
load_qpel_filterb x6, x5 load_qpel_filterb x6, x5
sub x2, x2, x3, lsl #1 sub x2, x2, x3, lsl #1
@ -1528,6 +1561,10 @@ function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon) b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc endfunc
// VVC-named entry point for the 4-wide weighted uni pixel routine.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_w_pixels4_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
mov w10, #-6 mov w10, #-6
sub w10, w10, w5 sub w10, w10, w5
@ -1598,6 +1635,10 @@ function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 8-wide weighted uni pixel routine.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_w_pixels8_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
mov w10, #-6 mov w10, #-6
sub w10, w10, w5 sub w10, w10, w5
@ -1741,7 +1782,9 @@ function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 16-wide weighted uni pixel routine.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_w_pixels16_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
mov w10, #-6 mov w10, #-6
@ -1803,6 +1846,9 @@ function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 32-wide weighted uni pixel routine.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_w_pixels32_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1 function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
mov w10, #-6 mov w10, #-6
@ -1839,6 +1885,39 @@ function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
ret ret
endfunc endfunc
// VVC-named entry point for the 64-wide weighted uni pixel routine.
// It only branches (without link) to the HEVC routine of the same width, so
// every argument register is passed through untouched and a `ret` in the
// target returns directly to this function's caller.
function ff_vvc_put_pel_uni_w_pixels64_8_neon, export=1
b X(ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon)
endfunc
// Weighted uni pixel routine for rows of 128 bytes: every 16-byte vector of
// a source row is run through PEL_UNI_W_PIXEL_CALC and stored to the
// destination row.
// Register use, as read from the code below:
//   x0  destination pointer (st1 target), advanced row by row
//   x1  destination row step in bytes
//   x2  source pointer (ld1 source), advanced row by row
//   x3  source row step in bytes
//   w4  row count; the loop ends when it reaches zero, so it must be
//       non-zero on entry (a zero count would wrap around)
//   w5  combined with the constant -6 and broadcast into v31
//   w6  broadcast into v30
//   w7  broadcast into v29
// NOTE(review): v29/v30/v31 are presumably the offset, weight and shift that
// PEL_UNI_W_PIXEL_CALC reads implicitly; the macro is defined outside this
// block -- confirm against its definition.
// Modifies x0-x4, w10, v0-v7, v16-v23, v29-v31 and the condition flags.
function ff_vvc_put_pel_uni_w_pixels128_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5            // w10 = -(6 + w5)
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
        // Each row is handled as two 64-byte halves. The first half
        // post-increments its pointer by 64, so 64 is taken off both row
        // steps here; the second post-increment then lands on the next row.
        sub             x1, x1, #64
        sub             x3, x3, #64
1:
        // First 64 bytes of the row.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        // Second 64 bytes; x2 moves on to the next source row.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        // Count the row; the flags set here are the ones b.ne tests below.
        subs            w4, w4, #1
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        // x0 moves on to the next destination row.
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        b.ne            1b
        ret
endfunc
.macro QPEL_UNI_W_V_HEADER .macro QPEL_UNI_W_V_HEADER
ldur x12, [sp, #8] // my ldur x12, [sp, #8] // my
sub x2, x2, x3, lsl #1 sub x2, x2, x3, lsl #1

@ -3,5 +3,6 @@ clean::
OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/dsp_init.o OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/dsp_init.o
NEON-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/alf.o \ NEON-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/alf.o \
aarch64/h26x/epel_neon.o \
aarch64/h26x/qpel_neon.o \ aarch64/h26x/qpel_neon.o \
aarch64/h26x/sao_neon.o aarch64/h26x/sao_neon.o

@ -46,6 +46,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
return; return;
if (bd == 8) { if (bd == 8) {
c->inter.put[0][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
c->inter.put[0][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
c->inter.put[0][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
c->inter.put[0][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
c->inter.put[0][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
c->inter.put[0][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon; c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon;
c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon; c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon;
c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon; c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon;
@ -53,6 +60,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[0][5][0][1] = c->inter.put[0][5][0][1] =
c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon; c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
c->inter.put_uni[0][4][0][0] = ff_vvc_put_pel_uni_pixels32_8_neon;
c->inter.put_uni[0][5][0][0] = ff_vvc_put_pel_uni_pixels64_8_neon;
c->inter.put_uni[0][6][0][0] = ff_vvc_put_pel_uni_pixels128_8_neon;
c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon; c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon;
c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon; c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon;
c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon; c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon;
@ -60,6 +74,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put_uni[0][5][0][1] = c->inter.put_uni[0][5][0][1] =
c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon; c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon;
c->inter.put_uni_w[0][1][0][0] = ff_vvc_put_pel_uni_w_pixels4_8_neon;
c->inter.put_uni_w[0][2][0][0] = ff_vvc_put_pel_uni_w_pixels8_8_neon;
c->inter.put_uni_w[0][3][0][0] = ff_vvc_put_pel_uni_w_pixels16_8_neon;
c->inter.put_uni_w[0][4][0][0] = ff_vvc_put_pel_uni_w_pixels32_8_neon;
c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon;
c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++) for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon; c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon; c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;

Loading…
Cancel
Save