/* Mirror of https://github.com/FFmpeg/FFmpeg.git */
/*
 * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Used below as the fixed row pitch (VVC_MAX_PB_SIZE * 2 bytes) of the
// 16-bit intermediate buffers and to size the on-stack scratch rows.
#define VVC_MAX_PB_SIZE 128

/*
 * vvc_avg type, bit_depth
 *
 * Emits ff_vvc_<type>_<bit_depth>_neon, with <type> being avg or w_avg.
 *
 * Registers on entry of the generated function:
 *   x0: dst
 *   x1: dst_stride, added to dst after every row
 *   x2: src0, signed 16-bit samples, rows VVC_MAX_PB_SIZE * 2 bytes apart
 *   x3: src1, same layout as src0
 *   w4: width
 *   w5: height
 * w_avg only:
 *   x6: weight0 in bits 63:32, weight1 in bits 31:0
 *   x7: offset  in bits 63:32, shift   in bits 31:0
 *
 * Per sample:
 *   avg:   (src0 + src1 + (1 << (14 - bit_depth))) >> (15 - bit_depth)
 *   w_avg: (src0 * weight0 + src1 * weight1 + offset) >> shift
 * The result is saturated to 8 bits unsigned when bit_depth == 8, otherwise
 * clamped to [0, (1 << bit_depth) - 1] and stored as 16-bit.
 *
 * Vector register roles:
 *   v16: rounding constant (avg) or offset (w_avg), 32-bit lanes
 *   v17/v18: upper/lower clip bound (bit_depth >= 10 only)
 *   v19/v20: weight0/weight1 (w_avg only)
 *   v22: negated shift (w_avg only)
 *
 * Width dispatch: 8 -> label 8, > 8 -> label 16 (16 samples per inner
 * iteration, so width has to be a multiple of 16 there), 4 -> label 4,
 * anything else falls through to the 2-sample path.
 */
.macro vvc_avg type, bit_depth

// One row of 2 (tap == 2) or 4 (tap == 4) samples; advances src0, src1 and dst.
.macro vvc_\type\()_\bit_depth\()_2_4 tap
.if \tap == 2
        ldr             s0, [src0]
        ldr             s2, [src1]
.else
        ldr             d0, [src0]
        ldr             d2, [src1]
.endif

.ifc \type, avg
        saddl           v4.4s, v0.4h, v2.4h
        add             v4.4s, v4.4s, v16.4s
        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
.else
        mov             v4.16b, v16.16b             // start from offset
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        sqshl           v4.4s, v4.4s, v22.4s        // negative count: shifts right
        sqxtn           v4.4h, v4.4s
.endif

.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
.if \tap == 2
        str             h4, [dst]
.else // tap == 4
        str             s4, [dst]
.endif

.else // bit_depth > 8
        smin            v4.4h, v4.4h, v17.4h
        smax            v4.4h, v4.4h, v18.4h
.if \tap == 2
        str             s4, [dst]
.else
        str             d4, [dst]
.endif
.endif
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
.endm

function ff_vvc_\type\()_\bit_depth\()_neon, export=1
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        width           .req w4
        height          .req w5

        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        // The flags from this compare are consumed by b.eq/b.hi further down;
        // none of the instructions in between modifies them.
        cmp             width, #8
.ifc \type, avg
        movi            v16.4s, #(1 << (14 - \bit_depth))
.else
        lsr             x11, x6, #32                // weight0
        mov             w12, w6                     // weight1
        lsr             x13, x7, #32                // offset
        mov             w14, w7                     // shift

        dup             v19.8h, w11
        neg             w14, w14                    // so we can use sqshl
        dup             v20.8h, w12
        dup             v16.4s, w13
        dup             v22.4s, w14
.endif // avg

.if \bit_depth >= 10
        // clip pixel
        mov             w6, #((1 << \bit_depth) - 1)
        movi            v18.8h, #0
        dup             v17.8h, w6
.endif

        b.eq            8f
        b.hi            16f
        cmp             width, #4
        b.eq            4f
2:      // width == 2
        subs            height, height, #1
        vvc_\type\()_\bit_depth\()_2_4 2
        b.ne            2b
        b               32f
4:      // width == 4
        subs            height, height, #1
        vvc_\type\()_\bit_depth\()_2_4 4
        b.ne            4b
        b               32f
8:      // width == 8
        ld1             {v0.8h}, [src0], x10
        ld1             {v2.8h}, [src1], x10
.ifc \type, avg
        saddl           v4.4s, v0.4h, v2.4h
        saddl2          v5.4s, v0.8h, v2.8h
        add             v4.4s, v4.4s, v16.4s
        add             v5.4s, v5.4s, v16.4s
        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
.else
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqxtn           v4.4h, v4.4s
        sqxtn2          v4.8h, v5.4s
.endif
        subs            height, height, #1
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        st1             {v4.8b}, [dst], dst_stride
.else
        smin            v4.8h, v4.8h, v17.8h
        smax            v4.8h, v4.8h, v18.8h
        st1             {v4.8h}, [dst], dst_stride
.endif
        b.ne            8b
        b               32f
16:     // width >= 16
        // Per-row working copies: w6 counts remaining samples, x7/x8/x9 walk
        // src0/src1/dst inside the row.
        mov             w6, width
        mov             x7, src0
        mov             x8, src1
        mov             x9, dst
17:
        ldp             q0, q1, [x7], #32
        ldp             q2, q3, [x8], #32
.ifc \type, avg
        saddl           v4.4s, v0.4h, v2.4h
        saddl2          v5.4s, v0.8h, v2.8h
        saddl           v6.4s, v1.4h, v3.4h
        saddl2          v7.4s, v1.8h, v3.8h
        add             v4.4s, v4.4s, v16.4s
        add             v5.4s, v5.4s, v16.4s
        add             v6.4s, v6.4s, v16.4s
        add             v7.4s, v7.4s, v16.4s
        sqshrn          v4.4h, v4.4s, #(15 - \bit_depth)
        sqshrn2         v4.8h, v5.4s, #(15 - \bit_depth)
        sqshrn          v6.4h, v6.4s, #(15 - \bit_depth)
        sqshrn2         v6.8h, v7.4s, #(15 - \bit_depth)
.else // w_avg
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        mov             v6.16b, v16.16b
        mov             v7.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        smlal           v6.4s, v1.4h, v19.4h
        smlal           v6.4s, v3.4h, v20.4h
        smlal2          v7.4s, v1.8h, v19.8h
        smlal2          v7.4s, v3.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqshl           v6.4s, v6.4s, v22.4s
        sqshl           v7.4s, v7.4s, v22.4s
        sqxtn           v4.4h, v4.4s
        sqxtn           v6.4h, v6.4s
        sqxtn2          v4.8h, v5.4s
        sqxtn2          v6.8h, v7.4s
.endif // avg
        subs            w6, w6, #16
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        sqxtun2         v4.16b, v6.8h
        str             q4, [x9], #16
.else
        smin            v4.8h, v4.8h, v17.8h
        smin            v6.8h, v6.8h, v17.8h
        smax            v4.8h, v4.8h, v18.8h
        smax            v6.8h, v6.8h, v18.8h
        stp             q4, q6, [x9], #32
.endif
        b.ne            17b

        subs            height, height, #1
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
        b.ne            16b
32:
        ret

.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
.endm

// Instantiate ff_vvc_{avg,w_avg}_{8,10,12}_neon.
vvc_avg avg, 8
vvc_avg avg, 10
vvc_avg avg, 12
vvc_avg w_avg, 8
vvc_avg w_avg, 10
vvc_avg w_avg, 12

/* x0: int16_t *dst
 * x1: const uint8_t *_src
 * x2: ptrdiff_t _src_stride
 * w3: int height
 * x4: intptr_t mx
 * x5: intptr_t my
 * w6: int width
 *
 * Widens each 8-bit source sample to 16 bits and shifts it left by 2.
 * mx and my are not read by this function.
 *
 * Every row handles 16 + 4 samples when width > 16 and 8 + 4 samples
 * otherwise. x7 is the post-increment of the final 4-sample store,
 * (VVC_MAX_PB_SIZE * 2 + 8) - 2 * width, so that for width == 16 + 4 or
 * 8 + 4 a complete row advances dst by VVC_MAX_PB_SIZE * 2 bytes.
 *
 * The .req aliases declared here are deliberately left defined: the dmvr
 * functions that follow use them, and they are released at the end of
 * ff_vvc_dmvr_hv_10_neon.
 */
function ff_vvc_dmvr_8_neon, export=1
        dst             .req x0
        src             .req x1
        src_stride      .req x2
        height          .req w3
        mx              .req x4
        my              .req x5
        width           .req w6

        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6  // src += width per row already
        cset            w15, gt                     // width > 16
        movi            v16.8h, #2                  // DMVR_SHIFT
        sub             x7, x7, x6, lsl #1
1:
        cbz             w15, 2f
        // first 16 samples
        ldr             q0, [src], #16
        uxtl            v1.8h, v0.8b
        uxtl2           v2.8h, v0.16b
        ushl            v1.8h, v1.8h, v16.8h
        ushl            v2.8h, v2.8h, v16.8h
        stp             q1, q2, [dst], #32
        b               3f
2:
        // first 8 samples
        ldr             d0, [src], #8
        uxtl            v1.8h, v0.8b
        ushl            v1.8h, v1.8h, v16.8h
        str             q1, [dst], #16
3:
        // trailing 4 samples
        subs            height, height, #1
        ldr             s3, [src], #4
        uxtl            v4.8h, v3.8b
        ushl            v4.4h, v4.4h, v16.4h
        st1             {v4.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc

/*
 * Same register arguments as ff_vvc_dmvr_8_neon (the aliases dst, src,
 * src_stride, height and width declared there are still in effect), but src
 * holds 16-bit samples.
 *
 * Every sample becomes (sample + 2) >> 2. URSHR #2 computes exactly that
 * rounding shift on the 16-bit lanes directly; the result is at most
 * (65535 + 2) >> 2 = 16384, so it always fits in 16 bits and no widening or
 * saturating narrow is required.
 *
 * Every row handles 16 + 4 samples when width > 16 and 8 + 4 samples
 * otherwise; x7 is the post-increment of the final 4-sample store.
 */
function ff_vvc_dmvr_12_neon, export=1
        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6, lsl #1  // minus 2 * width bytes
        cset            w15, gt                     // width > 16
        sub             x7, x7, x6, lsl #1
1:
        cbz             w15, 2f
        // first 16 samples
        ldp             q0, q1, [src], #32
        urshr           v2.8h, v0.8h, #2
        urshr           v4.8h, v1.8h, #2
        stp             q2, q4, [dst], #32
        b               3f
2:
        // first 8 samples
        ldr             q0, [src], #16
        urshr           v2.8h, v0.8h, #2
        str             q2, [dst], #16
3:
        // trailing 4 samples
        subs            height, height, #1
        ldr             d0, [src], #8
        urshr           v3.4h, v0.4h, #2
        st1             {v3.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc

/*
 * Two-tap horizontal + vertical filter on 8-bit input.
 *
 * Uses the register aliases declared in ff_vvc_dmvr_8_neon:
 *   dst (x0), src (x1), src_stride (x2), height (w3), mx (x4), my (x5),
 *   width (w6).
 *
 * The taps are the byte pairs at ff_vvc_inter_luma_dmvr_filters + 2 * mx
 * (horizontal, v0/v1) and + 2 * my (vertical, v2/v3).
 *
 *   tmp[x] = (src[x] * fx0 + src[x + 1] * fx1 + 2) >> 2
 *   dst[x] = (tmp_prev_row[x] * fy0 + tmp_cur_row[x] * fy1 + 8) >> 4
 *
 * height + 1 source rows are filtered horizontally; the first one only primes
 * the vertical stage (w10 == 0), so dst is pre-decremented by one row.
 * Two scratch rows live in VVC_MAX_PB_SIZE * 4 bytes of stack; tmp0 holds the
 * previous row, tmp1 receives the current one, and they swap every iteration.
 *
 * tmp0/tmp1 stay defined for ff_vvc_dmvr_hv_10_neon, which releases them.
 */
function ff_vvc_dmvr_hv_8_neon, export=1
        tmp0            .req x7
        tmp1            .req x8

        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        dup             v0.8h, w10                  // filter_x[0]
        dup             v1.8h, w11                  // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6
        movi            v30.8h, #(1 << (8 - 7))     // offset1
        movi            v31.8h, #8                  // offset2
        dup             v2.8h, w10                  // filter_y[0]
        dup             v3.8h, w11                  // filter_y[1]

        // Valid value for width can only be 8 + 4, 16 + 4
        cmp             width, #16
        mov             w10, #0                     // start filter_y or not
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, x6
        cset            w15, gt                     // width > 16
1:
        mov             x12, tmp0                   // previous filtered row (read)
        mov             x13, tmp1                   // current filtered row (write)
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        ldur            q5, [src, #1]
        ldr             q4, [src], #16
        uxtl            v7.8h, v5.8b
        uxtl2           v17.8h, v5.16b
        uxtl            v6.8h, v4.8b
        uxtl2           v16.8h, v4.16b
        mul             v6.8h, v6.8h, v0.8h
        mul             v16.8h, v16.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        mla             v16.8h, v17.8h, v1.8h
        add             v6.8h, v6.8h, v30.8h
        add             v16.8h, v16.8h, v30.8h
        ushr            v6.8h, v6.8h, #(8 - 6)
        ushr            v7.8h, v16.8h, #(8 - 6)
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q16, q17, [x12], #32
        mul             v16.8h, v16.8h, v2.8h
        mul             v17.8h, v17.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        mla             v17.8h, v7.8h, v3.8h
        add             v16.8h, v16.8h, v31.8h
        add             v17.8h, v17.8h, v31.8h
        ushr            v16.8h, v16.8h, #4
        ushr            v17.8h, v17.8h, #4
        stp             q16, q17, [x14], #32
        b               3f
2:
        // width > 8
        ldur            d5, [src, #1]
        ldr             d4, [src], #8
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        add             v6.8h, v6.8h, v30.8h
        ushr            v6.8h, v6.8h, #(8 - 6)
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        mul             v16.8h, v16.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        add             v16.8h, v16.8h, v31.8h
        ushr            v16.8h, v16.8h, #4
        str             q16, [x14], #16
3:
        // trailing 4 samples
        // An offset of 1 is not a multiple of the access size, so the
        // unscaled form is spelled out, as in the two loads above.
        ldur            s5, [src, #1]
        ldr             s4, [src], #4
        uxtl            v7.8h, v5.8b
        uxtl            v6.8h, v4.8b
        mul             v6.4h, v6.4h, v0.4h
        mla             v6.4h, v7.4h, v1.4h
        add             v6.4h, v6.4h, v30.4h
        ushr            v6.4h, v6.4h, #(8 - 6)
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        mul             v16.4h, v16.4h, v2.4h
        mla             v16.4h, v6.4h, v3.4h
        add             v16.4h, v16.4h, v31.4h
        ushr            v16.4h, v16.4h, #4
        str             d16, [x14], #8
4:
        // The flags set by subs survive until b.ne: mov, add and eor leave
        // them untouched.
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // swap tmp0 and tmp1
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret
endfunc

/*
 * 12-bit entry point: loads the bit-depth dependent constants and then
 * branches to label 0 inside ff_vvc_dmvr_hv_10_neon, which holds the shared
 * body.
 *   v29: shift1 = 12 - 6 (negated at label 0)
 *   v30: offset1 = 1 << (12 - 7)
 */
function ff_vvc_dmvr_hv_12_neon, export=1
        movi            v29.4s, #(12 - 6)
        movi            v30.4s, #(1 << (12 - 7))    // offset1
        b               0f
endfunc

/*
 * Two-tap horizontal + vertical filter on 16-bit input. Label 0 is also the
 * target of the branch in ff_vvc_dmvr_hv_12_neon, which arrives with its own
 * v29/v30.
 *
 * Uses the register aliases declared in ff_vvc_dmvr_8_neon (dst, src,
 * src_stride, height, mx, my, width) and ff_vvc_dmvr_hv_8_neon (tmp0, tmp1);
 * all of them are released at the end of this function.
 *
 *   v29: -shift1, used as a negative count for ushl (i.e. a right shift)
 *   v30: offset1, v31: offset2
 *   v0/v1: horizontal taps, v2/v3: vertical taps; byte pairs read at
 *          ff_vvc_inter_luma_dmvr_filters + 2 * mx and + 2 * my
 *
 *   tmp[x] = (src[x] * fx0 + src[x + 1] * fx1 + offset1) >> shift1
 *   dst[x] = (tmp_prev_row[x] * fy0 + tmp_cur_row[x] * fy1 + 8) >> 4
 *
 * height + 1 source rows are filtered horizontally; the first one only primes
 * the vertical stage (w10 == 0), so dst is pre-decremented by one row.
 * Two scratch rows live in VVC_MAX_PB_SIZE * 4 bytes of stack and are swapped
 * every iteration.
 */
function ff_vvc_dmvr_hv_10_neon, export=1
        movi            v29.4s, #(10 - 6)
        movi            v30.4s, #(1 << (10 - 7))    // offset1
0:
        movi            v31.4s, #8                  // offset2
        neg             v29.4s, v29.4s

        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        dup             v0.8h, w10                  // filter_x[0]
        dup             v1.8h, w11                  // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6
        dup             v2.8h, w10                  // filter_y[0]
        dup             v3.8h, w11                  // filter_y[1]

        // Valid value for width can only be 8 + 4, 16 + 4
        cmp             width, #16
        mov             w10, #0                     // start filter_y or not
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, x6, lsl #1
        cset            w15, gt                     // width > 16
1:
        mov             x12, tmp0                   // previous filtered row (read)
        mov             x13, tmp1                   // current filtered row (write)
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        add             x16, src, #2                // neighbour sample, one to the right
        ldp             q6, q16, [src], #32
        ldp             q7, q17, [x16]
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umull           v18.4s, v16.4h, v0.4h
        umull2          v19.4s, v16.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h
        umlal           v18.4s, v17.4h, v1.4h
        umlal2          v19.4s, v17.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        add             v18.4s, v18.4s, v30.4s
        add             v19.4s, v19.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        ushl            v18.4s, v18.4s, v29.4s
        ushl            v19.4s, v19.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        uqxtn           v7.4h, v18.4s
        uqxtn2          v7.8h, v19.4s
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q4, q5, [x12], #32
        umull           v17.4s, v4.4h, v2.4h
        umull2          v18.4s, v4.8h, v2.8h
        umull           v19.4s, v5.4h, v2.4h
        umull2          v20.4s, v5.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        umlal           v19.4s, v7.4h, v3.4h
        umlal2          v20.4s, v7.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        add             v19.4s, v19.4s, v31.4s
        add             v20.4s, v20.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        ushr            v19.4s, v19.4s, #4
        ushr            v20.4s, v20.4s, #4
        uqxtn           v6.4h, v17.4s
        uqxtn2          v6.8h, v18.4s
        uqxtn           v7.4h, v19.4s
        uqxtn2          v7.8h, v20.4s
        stp             q6, q7, [x14], #32
        b               3f
2:
        // width > 8
        ldur            q7, [src, #2]
        ldr             q6, [src], #16
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h

        add             v4.4s, v4.4s, v30.4s
        add             v5.4s, v5.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        ushl            v5.4s, v5.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        umull           v17.4s, v16.4h, v2.4h
        umull2          v18.4s, v16.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        add             v17.4s, v17.4s, v31.4s
        add             v18.4s, v18.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        ushr            v18.4s, v18.4s, #4
        uqxtn           v16.4h, v17.4s
        uqxtn2          v16.8h, v18.4s
        str             q16, [x14], #16
3:
        // trailing 4 samples
        // An offset of 2 is not a multiple of the 8-byte access size, so the
        // unscaled form is spelled out, as in the q-register load above.
        ldur            d7, [src, #2]
        ldr             d6, [src], #8
        umull           v4.4s, v7.4h, v1.4h
        umlal           v4.4s, v6.4h, v0.4h
        add             v4.4s, v4.4s, v30.4s
        ushl            v4.4s, v4.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        umull           v17.4s, v16.4h, v2.4h
        umlal           v17.4s, v6.4h, v3.4h
        add             v17.4s, v17.4s, v31.4s
        ushr            v17.4s, v17.4s, #4
        uqxtn           v16.4h, v17.4s
        str             d16, [x14], #8
4:
        // The flags set by subs survive until b.ne: mov, add and eor leave
        // them untouched.
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // swap tmp0 and tmp1
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret

.unreq dst
.unreq src
.unreq src_stride
.unreq height
.unreq mx
.unreq my
.unreq width
.unreq tmp0
.unreq tmp1
endfunc