aarch64: Reindent all assembly to 8/24 column indentation

libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally
uses a layered indentation style to visually show how different
unrolled/interleaved phases fit together.

Signed-off-by: Martin Storsjö <martin@martin.st>
Branch:  pull/390/head
Author:  Martin Storsjö
Commit:  a76b409dd0 (parent cada4597ca)
Changed files (lines changed):
    libavcodec/aarch64/aacpsdsp_neon.S  |  218
    libavcodec/aarch64/opusdsp_neon.S   |  102
    libswresample/aarch64/resample.S    |   80
    libswscale/aarch64/hscale.S         | 2250
    libswscale/aarch64/output.S         |  330
    libswscale/aarch64/yuv2rgb_neon.S   |  220

--- a/libavcodec/aarch64/aacpsdsp_neon.S
+++ b/libavcodec/aarch64/aacpsdsp_neon.S
@@ -19,130 +19,130 @@
#include "libavutil/aarch64/asm.S"
function ff_ps_add_squares_neon, export=1
1:      ld1             {v0.4s,v1.4s}, [x1], #32
        fmul            v0.4s, v0.4s, v0.4s
        fmul            v1.4s, v1.4s, v1.4s
        faddp           v2.4s, v0.4s, v1.4s
        ld1             {v3.4s}, [x0]
        fadd            v3.4s, v3.4s, v2.4s
        st1             {v3.4s}, [x0], #16
        subs            w2, w2, #4
        b.gt            1b
        ret
endfunc
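
As a reading aid, a rough scalar C model of what ff_ps_add_squares_neon computes, inferred from the vector code above; parameter names are illustrative, not FFmpeg's actual prototype:

// dst[i] += |src[i]|^2 for n complex (re, im) samples
static void ps_add_squares_model(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}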
function ff_ps_mul_pair_single_neon, export=1
1:      ld1             {v0.4s,v1.4s}, [x1], #32
        ld1             {v2.4s}, [x2], #16
        zip1            v3.4s, v2.4s, v2.4s
        zip2            v4.4s, v2.4s, v2.4s
        fmul            v0.4s, v0.4s, v3.4s
        fmul            v1.4s, v1.4s, v4.4s
        st1             {v0.4s,v1.4s}, [x0], #32
        subs            w3, w3, #4
        b.gt            1b
        ret
endfunc
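
Similarly, a hedged scalar sketch of ff_ps_mul_pair_single_neon: each complex sample is scaled by a per-sample real gain (the zip1/zip2 duplicate each gain onto the re and im lanes); names are illustrative:

static void ps_mul_pair_single_model(float (*dst)[2], const float (*src)[2],
                                     const float *g, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src[i][0] * g[i];
        dst[i][1] = src[i][1] * g[i];
    }
}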
function ff_ps_stereo_interpolate_neon, export=1
        ld1             {v0.4s}, [x2]
        ld1             {v1.4s}, [x3]
        zip1            v4.4s, v0.4s, v0.4s
        zip2            v5.4s, v0.4s, v0.4s
        zip1            v6.4s, v1.4s, v1.4s
        zip2            v7.4s, v1.4s, v1.4s
1:      ld1             {v2.2s}, [x0]
        ld1             {v3.2s}, [x1]
        fadd            v4.4s, v4.4s, v6.4s
        fadd            v5.4s, v5.4s, v7.4s
        mov             v2.d[1], v2.d[0]
        mov             v3.d[1], v3.d[0]
        fmul            v2.4s, v2.4s, v4.4s
        fmla            v2.4s, v3.4s, v5.4s
        st1             {v2.d}[0], [x0], #8
        st1             {v2.d}[1], [x1], #8
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc
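
ff_ps_stereo_interpolate_neon remixes a left/right sample pair with four real coefficients that are stepped once per sample (the two fadd instructions at the top of the loop). A scalar model under those assumptions, with illustrative names:

static void ps_stereo_interpolate_model(float (*l)[2], float (*r)[2],
                                        float h[4], const float h_step[4],
                                        int len)
{
    for (int n = 0; n < len; n++) {
        float l_re = l[n][0], l_im = l[n][1];
        float r_re = r[n][0], r_im = r[n][1];
        for (int i = 0; i < 4; i++)
            h[i] += h_step[i];              // coefficients advance before use
        l[n][0] = h[0] * l_re + h[2] * r_re;
        l[n][1] = h[0] * l_im + h[2] * r_im;
        r[n][0] = h[1] * l_re + h[3] * r_re;
        r[n][1] = h[1] * l_im + h[3] * r_im;
    }
}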
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
        ld1             {v0.4s,v1.4s}, [x2]
        ld1             {v6.4s,v7.4s}, [x3]
        fneg            v2.4s, v1.4s
        fneg            v3.4s, v7.4s
        zip1            v16.4s, v0.4s, v0.4s
        zip2            v17.4s, v0.4s, v0.4s
        zip1            v18.4s, v2.4s, v1.4s
        zip2            v19.4s, v2.4s, v1.4s
        zip1            v20.4s, v6.4s, v6.4s
        zip2            v21.4s, v6.4s, v6.4s
        zip1            v22.4s, v3.4s, v7.4s
        zip2            v23.4s, v3.4s, v7.4s
1:      ld1             {v2.2s}, [x0]
        ld1             {v3.2s}, [x1]
        fadd            v16.4s, v16.4s, v20.4s
        fadd            v17.4s, v17.4s, v21.4s
        mov             v2.d[1], v2.d[0]
        mov             v3.d[1], v3.d[0]
        fmul            v4.4s, v2.4s, v16.4s
        fmla            v4.4s, v3.4s, v17.4s
        fadd            v18.4s, v18.4s, v22.4s
        fadd            v19.4s, v19.4s, v23.4s
        ext             v2.16b, v2.16b, v2.16b, #4
        ext             v3.16b, v3.16b, v3.16b, #4
        fmla            v4.4s, v2.4s, v18.4s
        fmla            v4.4s, v3.4s, v19.4s
        st1             {v4.d}[0], [x0], #8
        st1             {v4.d}[1], [x1], #8
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc
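
The ipdopd variant performs the same remix with complex coefficients: the fneg/zip setup builds {-im, im} pairs so the ext/fmla sequence in the loop supplies the imaginary half of each complex multiply. A sketch, assuming h[0..3] are the real parts and h[4..7] the imaginary parts (layout read off the code, not confirmed against the C reference):

static void ps_stereo_interpolate_ipdopd_model(float (*l)[2], float (*r)[2],
                                               float h[8], const float h_step[8],
                                               int len)
{
    for (int n = 0; n < len; n++) {
        float l_re = l[n][0], l_im = l[n][1];
        float r_re = r[n][0], r_im = r[n][1];
        for (int i = 0; i < 8; i++)
            h[i] += h_step[i];
        l[n][0] = h[0] * l_re - h[4] * l_im + h[2] * r_re - h[6] * r_im;
        l[n][1] = h[0] * l_im + h[4] * l_re + h[2] * r_im + h[6] * r_re;
        r[n][0] = h[1] * l_re - h[5] * l_im + h[3] * r_re - h[7] * r_im;
        r[n][1] = h[1] * l_im + h[5] * l_re + h[3] * r_im + h[7] * r_re;
    }
}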
function ff_ps_hybrid_analysis_neon, export=1
        lsl             x3, x3, #3
        ld2             {v0.4s,v1.4s}, [x1], #32
        ld2             {v2.2s,v3.2s}, [x1], #16
        ld1             {v24.2s}, [x1], #8
        ld2             {v4.2s,v5.2s}, [x1], #16
        ld2             {v6.4s,v7.4s}, [x1]
        rev64           v6.4s, v6.4s
        rev64           v7.4s, v7.4s
        ext             v6.16b, v6.16b, v6.16b, #8
        ext             v7.16b, v7.16b, v7.16b, #8
        rev64           v4.2s, v4.2s
        rev64           v5.2s, v5.2s
        mov             v2.d[1], v3.d[0]
        mov             v4.d[1], v5.d[0]
        mov             v5.d[1], v2.d[0]
        mov             v3.d[1], v4.d[0]
        fadd            v16.4s, v0.4s, v6.4s
        fadd            v17.4s, v1.4s, v7.4s
        fsub            v18.4s, v1.4s, v7.4s
        fsub            v19.4s, v0.4s, v6.4s
        fadd            v22.4s, v2.4s, v4.4s
        fsub            v23.4s, v5.4s, v3.4s
        trn1            v20.2d, v22.2d, v23.2d  // {re4+re8, re5+re7, im8-im4, im7-im5}
        trn2            v21.2d, v22.2d, v23.2d  // {im4+im8, im5+im7, re4-re8, re5-re7}
1:      ld2             {v2.4s,v3.4s}, [x2], #32
        ld2             {v4.2s,v5.2s}, [x2], #16
        ld1             {v6.2s}, [x2], #8
        add             x2, x2, #8
        mov             v4.d[1], v5.d[0]
        mov             v6.s[1], v6.s[0]
        fmul            v6.2s, v6.2s, v24.2s
        fmul            v0.4s, v2.4s, v16.4s
        fmul            v1.4s, v2.4s, v17.4s
        fmls            v0.4s, v3.4s, v18.4s
        fmla            v1.4s, v3.4s, v19.4s
        fmla            v0.4s, v4.4s, v20.4s
        fmla            v1.4s, v4.4s, v21.4s
        faddp           v0.4s, v0.4s, v1.4s
        faddp           v0.4s, v0.4s, v0.4s
        fadd            v0.2s, v0.2s, v6.2s
        st1             {v0.2s}, [x0], x3
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc
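
ff_ps_hybrid_analysis_neon evaluates a 13-tap complex filter per output bin; the prologue precomputes the sums and differences in[j] ± in[12-j] (see the trn1/trn2 comments) so the per-bin loop needs only a few multiply-accumulates. A scalar model of that computation; array shapes and names are illustrative:

static void ps_hybrid_analysis_model(float (*out)[2], const float (*in)[2],
                                     const float (*filter)[13][2],
                                     int stride, int n)
{
    for (int i = 0; i < n; i++) {
        float sum_re = filter[i][6][0] * in[6][0];  // middle tap, real coeff only
        float sum_im = filter[i][6][0] * in[6][1];

        for (int j = 0; j < 6; j++) {
            float re0 = in[j][0],      im0 = in[j][1];
            float re1 = in[12 - j][0], im1 = in[12 - j][1];
            sum_re += filter[i][j][0] * (re0 + re1) -
                      filter[i][j][1] * (im0 - im1);
            sum_im += filter[i][j][0] * (im0 + im1) +
                      filter[i][j][1] * (re0 - re1);
        }
        out[i * stride][0] = sum_re;
        out[i * stride][1] = sum_im;
    }
}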

--- a/libavcodec/aarch64/opusdsp_neon.S
+++ b/libavcodec/aarch64/opusdsp_neon.S
@@ -33,81 +33,81 @@ const tab_x2, align=4
endconst
function ff_opus_deemphasis_neon, export=1
        movrel          x4, tab_st
        ld1             {v4.4s}, [x4]
        movrel          x4, tab_x0
        ld1             {v5.4s}, [x4]
        movrel          x4, tab_x1
        ld1             {v6.4s}, [x4]
        movrel          x4, tab_x2
        ld1             {v7.4s}, [x4]

        fmul            v0.4s, v4.4s, v0.s[0]

1:      ld1             {v1.4s, v2.4s}, [x1], #32

        fmla            v0.4s, v5.4s, v1.s[0]
        fmul            v3.4s, v7.4s, v2.s[2]

        fmla            v0.4s, v6.4s, v1.s[1]
        fmla            v3.4s, v6.4s, v2.s[1]

        fmla            v0.4s, v7.4s, v1.s[2]
        fmla            v3.4s, v5.4s, v2.s[0]

        fadd            v1.4s, v1.4s, v0.4s
        fadd            v2.4s, v2.4s, v3.4s

        fmla            v2.4s, v4.4s, v1.s[3]

        st1             {v1.4s, v2.4s}, [x0], #32
        fmul            v0.4s, v4.4s, v2.s[3]

        subs            w2, w2, #8
        b.gt            1b

        mov             s0, v2.s[3]
        ret
endfunc
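
ff_opus_deemphasis_neon vectorizes a one-pole deemphasis IIR by expanding the recurrence into the tab_st/tab_x0..tab_x2 tables of coefficient powers. The underlying scalar filter, assuming the usual CELT emphasis constant, is roughly:

#define EMPH_COEFF 0.85f   // assumed CELT deemphasis coefficient

// y[n] = x[n] + EMPH_COEFF * y[n-1]; returns the final state (kept in s0 above)
static float opus_deemphasis_model(float *y, const float *x, float state, int len)
{
    for (int i = 0; i < len; i++)
        state = y[i] = x[i] + state * EMPH_COEFF;
    return state;
}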
function ff_opus_postfilter_neon, export=1
        ld1             {v0.4s}, [x2]
        dup             v1.4s, v0.s[1]
        dup             v2.4s, v0.s[2]
        dup             v0.4s, v0.s[0]

        add             w1, w1, #2
        sub             x1, x0, x1, lsl #2

        ld1             {v3.4s}, [x1]
        fmul            v3.4s, v3.4s, v2.4s

1:      add             x1, x1, #4
        ld1             {v4.4s}, [x1]
        add             x1, x1, #4
        ld1             {v5.4s}, [x1]
        add             x1, x1, #4
        ld1             {v6.4s}, [x1]
        add             x1, x1, #4
        ld1             {v7.4s}, [x1]

        fmla            v3.4s, v7.4s, v2.4s
        fadd            v6.4s, v6.4s, v4.4s

        ld1             {v4.4s}, [x0]
        fmla            v4.4s, v5.4s, v0.4s

        fmul            v6.4s, v6.4s, v1.4s
        fadd            v6.4s, v6.4s, v3.4s

        fadd            v4.4s, v4.4s, v6.4s
        fmul            v3.4s, v7.4s, v2.4s

        st1             {v4.4s}, [x0], #16

        subs            w3, w3, #4
        b.gt            1b
        ret
endfunc
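
ff_opus_postfilter_neon applies a 5-tap symmetric comb filter centred on data[i - period]; the four overlapping ld1 loads in the loop cover the taps at offsets -2..+2. A scalar sketch with illustrative names:

static void opus_postfilter_model(float *data, int period,
                                  const float g[3], int len)
{
    for (int i = 0; i < len; i++) {
        const float *p = data + i - period;   // history before data[0] must be valid
        data[i] += g[0] *  p[0]
                 + g[1] * (p[-1] + p[1])
                 + g[2] * (p[-2] + p[2]);
    }
}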

--- a/libswresample/aarch64/resample.S
+++ b/libswresample/aarch64/resample.S
@@ -21,57 +21,57 @@
#include "libavutil/aarch64/asm.S"
function ff_resample_common_apply_filter_x4_float_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.4s}, [x1], #16              // src[0..3]
        ld1             {v2.4s}, [x2], #16              // filter[0..3]
        fmla            v0.4s, v1.4s, v2.4s             // accumulator += src[0..3] * filter[0..3]
        subs            w3, w3, #4                      // filter_length -= 4
        b.gt            1b                              // loop until filter_length
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc
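
The float apply_filter helpers (x4 above, x8 below) are the same dot product with different unrolling; in scalar C terms, roughly (names illustrative):

static void resample_apply_filter_model(float *dst, const float *src,
                                        const float *filter, int filter_length)
{
    float acc = 0.0f;
    for (int i = 0; i < filter_length; i++)
        acc += src[i] * filter[i];
    *dst = acc;                 // the asm also post-increments the dst pointer
}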
function ff_resample_common_apply_filter_x8_float_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.4s}, [x1], #16              // src[0..3]
        ld1             {v2.4s}, [x2], #16              // filter[0..3]
        ld1             {v3.4s}, [x1], #16              // src[4..7]
        ld1             {v4.4s}, [x2], #16              // filter[4..7]
        fmla            v0.4s, v1.4s, v2.4s             // accumulator += src[0..3] * filter[0..3]
        fmla            v0.4s, v3.4s, v4.4s             // accumulator += src[4..7] * filter[4..7]
        subs            w3, w3, #8                      // filter_length -= 8
        b.gt            1b                              // loop until filter_length
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        faddp           v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc
function ff_resample_common_apply_filter_x4_s16_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.4h}, [x1], #8               // src[0..3]
        ld1             {v2.4h}, [x2], #8               // filter[0..3]
        smlal           v0.4s, v1.4h, v2.4h             // accumulator += src[0..3] * filter[0..3]
        subs            w3, w3, #4                      // filter_length -= 4
        b.gt            1b                              // loop until filter_length
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc
function ff_resample_common_apply_filter_x8_s16_neon, export=1
        movi            v0.4s, #0                       // accumulator
1:      ld1             {v1.8h}, [x1], #16              // src[0..7]
        ld1             {v2.8h}, [x2], #16              // filter[0..7]
        smlal           v0.4s, v1.4h, v2.4h             // accumulator += src[0..3] * filter[0..3]
        smlal2          v0.4s, v1.8h, v2.8h             // accumulator += src[4..7] * filter[4..7]
        subs            w3, w3, #8                      // filter_length -= 8
        b.gt            1b                              // loop until filter_length
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        addp            v0.4s, v0.4s, v0.4s             // pair adding of the 4x32-bit accumulated values
        st1             {v0.s}[0], [x0], #4             // write accumulator
        ret
endfunc
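
The s16 variants above differ only in operand width: 16-bit samples and coefficients multiplied into a 32-bit accumulator (smlal/smlal2), reduced with integer addp instead of faddp. A scalar sketch:

#include <stdint.h>

static void resample_apply_filter_s16_model(int32_t *dst, const int16_t *src,
                                            const int16_t *filter,
                                            int filter_length)
{
    int32_t acc = 0;
    for (int i = 0; i < filter_length; i++)
        acc += (int32_t)src[i] * filter[i];
    *dst = acc;
}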

(Diff for libswscale/aarch64/hscale.S suppressed because it is too large.)

--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -29,178 +29,178 @@ function ff_yuv2planeX_8_neon, export=1
// x5 - const uint8_t *dither,
// w6 - int offset
        ld1             {v0.8b}, [x5]                   // load 8x8-bit dither
        and             w6, w6, #7
        cbz             w6, 1f                          // check if offsetting present
        ext             v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only
1:      uxtl            v0.8h, v0.8b                    // extend dither to 16-bit
        ushll           v1.4s, v0.4h, #12               // extend dither to 32-bit with left shift by 12 (part 1)
        ushll2          v2.4s, v0.8h, #12               // extend dither to 32-bit with left shift by 12 (part 2)
        cmp             w1, #8                          // if filterSize == 8, branch to specialized version
        b.eq            6f
        cmp             w1, #4                          // if filterSize == 4, branch to specialized version
        b.eq            8f
        cmp             w1, #2                          // if filterSize == 2, branch to specialized version
        b.eq            10f
// The filter size does not match any of the specialized implementations. It is either even or odd.
// If it is even, then use the first section below.
        mov             x7, #0                          // i = 0
        tbnz            w1, #0, 4f                      // if filterSize % 2 != 0 branch to specialized version
// fs % 2 == 0
2:      mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value
        mov             w8, w1                          // tmpfilterSize = filterSize
        mov             x9, x2                          // srcp = src
        mov             x10, x0                         // filterp = filter
3:      ldp             x11, x12, [x9], #16             // get 2 pointers: src[j] and src[j+1]
        ldr             s7, [x10], #4                   // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
        add             x11, x11, x7, lsl #1            // &src[j ][i]
        add             x12, x12, x7, lsl #1            // &src[j+1][i]
        ld1             {v5.8h}, [x11]                  // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
        ld1             {v6.8h}, [x12]                  // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
        smlal           v3.4s, v5.4h, v7.h[0]           // val0 += {A,B,C,D} * X
        smlal2          v4.4s, v5.8h, v7.h[0]           // val1 += {E,F,G,H} * X
        smlal           v3.4s, v6.4h, v7.h[1]           // val0 += {I,J,K,L} * Y
        smlal2          v4.4s, v6.8h, v7.h[1]           // val1 += {M,N,O,P} * Y
        subs            w8, w8, #2                      // tmpfilterSize -= 2
        b.gt            3b                              // loop until filterSize consumed
        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        add             x7, x7, #8                      // i += 8
        b.gt            2b                              // loop until width consumed
        ret
// If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0
4:      mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value
        mov             w8, w1                          // tmpfilterSize = filterSize
        mov             x9, x2                          // srcp = src
        mov             x10, x0                         // filterp = filter
5:      ldr             x11, [x9], #8                   // get 1 pointer: src[j]
        ldr             h6, [x10], #2                   // read 1 16 bit coeff X at filter[j]
        add             x11, x11, x7, lsl #1            // &src[j ][i]
        ld1             {v5.8h}, [x11]                  // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
        smlal           v3.4s, v5.4h, v6.h[0]           // val0 += {A,B,C,D} * X
        smlal2          v4.4s, v5.8h, v6.h[0]           // val1 += {E,F,G,H} * X
        subs            w8, w8, #1                      // tmpfilterSize -= 1
        b.gt            5b                              // loop until filterSize consumed
        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        add             x7, x7, #8                      // i += 8
        b.gt            4b                              // loop until width consumed
        ret
6: // fs=8
        ldp             x5, x6, [x2]                    // load 2 pointers: src[j ] and src[j+1]
        ldp             x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3]
        ldp             x10, x11, [x2, #32]             // load 2 pointers: src[j+4] and src[j+5]
        ldp             x12, x13, [x2, #48]             // load 2 pointers: src[j+6] and src[j+7]
        // load 8x16-bit values for filter[j], where j=0..7
        ld1             {v6.8h}, [x0]
7:
        mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value
        ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}]
        ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}]
        ld1             {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}]
        ld1             {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}]
        ld1             {v28.8h}, [x10], #16            // load 8x16-bit values for src[j + 4][i + {0..7}]
        ld1             {v29.8h}, [x11], #16            // load 8x16-bit values for src[j + 5][i + {0..7}]
        ld1             {v30.8h}, [x12], #16            // load 8x16-bit values for src[j + 6][i + {0..7}]
        ld1             {v31.8h}, [x13], #16            // load 8x16-bit values for src[j + 7][i + {0..7}]
        smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0]
        smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0]
        smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1]
        smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1]
        smlal           v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2]
        smlal2          v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2]
        smlal           v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3]
        smlal2          v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3]
        smlal           v3.4s, v28.4h, v6.h[4]          // val0 += src[4][i + {0..3}] * filter[4]
        smlal2          v4.4s, v28.8h, v6.h[4]          // val1 += src[4][i + {4..7}] * filter[4]
        smlal           v3.4s, v29.4h, v6.h[5]          // val0 += src[5][i + {0..3}] * filter[5]
        smlal2          v4.4s, v29.8h, v6.h[5]          // val1 += src[5][i + {4..7}] * filter[5]
        smlal           v3.4s, v30.4h, v6.h[6]          // val0 += src[6][i + {0..3}] * filter[6]
        smlal2          v4.4s, v30.8h, v6.h[6]          // val1 += src[6][i + {4..7}] * filter[6]
        smlal           v3.4s, v31.4h, v6.h[7]          // val0 += src[7][i + {0..3}] * filter[7]
        smlal2          v4.4s, v31.8h, v6.h[7]          // val1 += src[7][i + {4..7}] * filter[7]
        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        subs            w4, w4, #8                      // dstW -= 8
        st1             {v3.8b}, [x3], #8               // write to destination
        b.gt            7b                              // loop until width consumed
        ret
8: // fs=4
        ldp             x5, x6, [x2]                    // load 2 pointers: src[j ] and src[j+1]
        ldp             x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3]
        // load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes
        ld1             {v6.4h}, [x0]
9:
        mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value
        ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}]
        ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}]
        ld1             {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}]
        ld1             {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}]
        smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0]
        smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0]
        smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1]
        smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1]
        smlal           v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2]
        smlal2          v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2]
        smlal           v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3]
        smlal2          v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3]
        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        b.gt            9b                              // loop until width consumed
        ret
10: // fs=2
        ldp             x5, x6, [x2]                    // load 2 pointers: src[j ] and src[j+1]
        // load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes
        ldr             s6, [x0]
11:
        mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value
        mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value
        ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}]
        ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}]
        smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0]
        smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0]
        smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1]
        smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1]
        sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16)
        sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16)
        uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19)
        st1             {v3.8b}, [x3], #8               // write to destination
        subs            w4, w4, #8                      // dstW -= 8
        b.gt            11b                             // loop until width consumed
        ret
endfunc
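
All paths of ff_yuv2planeX_8_neon above (generic even/odd and the fs=8/4/2 specializations) compute the same per-pixel value described by the comments: a dither term shifted up by 12, a sum over filterSize 16-bit source rows, and a saturating shift down by 19. A scalar model under those assumptions, with illustrative names:

#include <stdint.h>

static void yuv2planeX_8_model(const int16_t *filter, int filterSize,
                               const int16_t **src, uint8_t *dest, int dstW,
                               const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        val >>= 19;                          // sqshrun #16 followed by uqshrn #3
        dest[i] = val < 0 ? 0 : (val > 255 ? 255 : val);
    }
}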
@@ -210,25 +210,25 @@ function ff_yuv2plane1_8_neon, export=1
// w2 - int dstW,
// x3 - const uint8_t *dither,
// w4 - int offset
        ld1             {v0.8b}, [x3]                   // load 8x8-bit dither
        and             w4, w4, #7
        cbz             w4, 1f                          // check if offsetting present
        ext             v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only
1:      uxtl            v0.8h, v0.8b                    // extend dither to 32-bit
        uxtl            v1.4s, v0.4h
        uxtl2           v2.4s, v0.8h
2:
        ld1             {v3.8h}, [x0], #16              // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
        sxtl            v4.4s, v3.4h
        sxtl2           v5.4s, v3.8h
        add             v4.4s, v4.4s, v1.4s
        add             v5.4s, v5.4s, v2.4s
        sqshrun         v4.4h, v4.4s, #6
        sqshrun2        v4.8h, v5.4s, #6
        uqshrn          v3.8b, v4.8h, #1                // clip8(val>>7)
        subs            w2, w2, #8                      // dstW -= 8
        st1             {v3.8b}, [x1], #8               // write to destination
        b.gt            2b                              // loop until width consumed
        ret
endfunc
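
ff_yuv2plane1_8_neon is the single-source-row variant: dither is added directly (no accumulation) and the result is clipped after a total shift of 7. A scalar model:

#include <stdint.h>

static void yuv2plane1_8_model(const int16_t *src, uint8_t *dest, int dstW,
                               const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = (src[i] + dither[(i + offset) & 7]) >> 7;  // sqshrun #6 + uqshrn #1
        dest[i] = val < 0 ? 0 : (val > 255 ? 255 : val);
    }
}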

--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -23,23 +23,23 @@
.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
        ldp             w9, w10, [sp, #\yoff]
#else
        ldr             w9, [sp, #\yoff]
        ldr             w10, [sp, #\ycoeff]
#endif
.endm
.macro load_args_nv12
        ldr             x8, [sp]                        // table
        load_yoff_ycoeff 8, 16                          // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2              // w3 = linesize - width * 4 (padding)
        sub             w5, w5, w0                      // w5 = linesizeY - width (paddingY)
        sub             w7, w7, w0                      // w7 = linesizeC - width (paddingC)
        neg             w11, w0
.endm
.macro load_args_nv21
@@ -47,52 +47,52 @@
.endm
.macro load_args_yuv420p
        ldr             x13, [sp]                       // srcV
        ldr             w14, [sp, #8]                   // linesizeV
        ldr             x8, [sp, #16]                   // table
        load_yoff_ycoeff 24, 32                         // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2              // w3 = linesize - width * 4 (padding)
        sub             w5, w5, w0                      // w5 = linesizeY - width (paddingY)
        sub             w7, w7, w0, lsr #1              // w7 = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1            // w14 = linesizeV - width / 2 (paddingV)
        lsr             w11, w0, #1
        neg             w11, w11
.endm
.macro load_args_yuv422p
        ldr             x13, [sp]                       // srcV
        ldr             w14, [sp, #8]                   // linesizeV
        ldr             x8, [sp, #16]                   // table
        load_yoff_ycoeff 24, 32                         // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
        sub             w3, w3, w0, lsl #2              // w3 = linesize - width * 4 (padding)
        sub             w5, w5, w0                      // w5 = linesizeY - width (paddingY)
        sub             w7, w7, w0, lsr #1              // w7 = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1            // w14 = linesizeV - width / 2 (paddingV)
.endm
.macro load_chroma_nv12
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm
.macro load_chroma_nv21
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v19.8h, v16.8b, #3
        ushll           v18.8h, v17.8b, #3
.endm
.macro load_chroma_yuv420p
        ld1             {v16.8b}, [ x6], #8
        ld1             {v17.8b}, [x13], #8
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm
.macro load_chroma_yuv422p
@@ -100,9 +100,9 @@
.endm
.macro increment_nv12
        ands            w15, w1, #1
        csel            w16, w7, w11, ne                // incC = (h & 1) ? paddincC : -width
        add             x6, x6, w16, sxtw               // srcC += incC
.endm
.macro increment_nv21
@@ -110,100 +110,100 @@
.endm
.macro increment_yuv420p
        ands            w15, w1, #1
        csel            w16, w7, w11, ne                // incU = (h & 1) ? paddincU : -width/2
        csel            w17, w14, w11, ne               // incV = (h & 1) ? paddincV : -width/2
        add             x6, x6, w16, sxtw               // srcU += incU
        add             x13, x13, w17, sxtw             // srcV += incV
.endm
.macro increment_yuv422p
        add             x6, x6, w7, sxtw                // srcU += incU
        add             x13, x13, w14, sxtw             // srcV += incV
.endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
        add             v20.8h, v26.8h, v20.8h          // Y1 + R1
        add             v21.8h, v27.8h, v21.8h          // Y2 + R2
        add             v22.8h, v26.8h, v22.8h          // Y1 + G1
        add             v23.8h, v27.8h, v23.8h          // Y2 + G2
        add             v24.8h, v26.8h, v24.8h          // Y1 + B1
        add             v25.8h, v27.8h, v25.8h          // Y2 + B2
        sqrshrun        \r1, v20.8h, #1                 // clip_u8((Y1 + R1) >> 1)
        sqrshrun        \r2, v21.8h, #1                 // clip_u8((Y2 + R1) >> 1)
        sqrshrun        \g1, v22.8h, #1                 // clip_u8((Y1 + G1) >> 1)
        sqrshrun        \g2, v23.8h, #1                 // clip_u8((Y2 + G1) >> 1)
        sqrshrun        \b1, v24.8h, #1                 // clip_u8((Y1 + B1) >> 1)
        sqrshrun        \b2, v25.8h, #1                 // clip_u8((Y2 + B1) >> 1)
        movi            \a1, #255
        movi            \a2, #255
.endm
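
Taken together with the loop body in declare_func below, compute_rgba implements a fixed-point YUV-to-RGB conversion in which sqdmulh behaves as an (a*b)>>15 multiply and the final sqrshrun is a rounding >>1 with unsigned saturation. A hedged per-pixel C model; the coefficient order (v2r, u2g, v2g, u2b) is taken from the comments, everything else is illustrative:

#include <stdint.h>

static void yuv2rgba_pixel_model(uint8_t rgba[4], int y, int u, int v,
                                 const int16_t coeff[4],  /* v2r, u2g, v2g, u2b */
                                 int y_offset, int y_coeff)
{
    int y1 = (((y << 3) - y_offset) * y_coeff) >> 15;
    int u1 = (u - 128) << 3;
    int v1 = (v - 128) << 3;
    int r  = (v1 * coeff[0]) >> 15;
    int g  = ((u1 * coeff[1]) >> 15) + ((v1 * coeff[2]) >> 15);
    int b  = (u1 * coeff[3]) >> 15;

    int R = (y1 + r + 1) >> 1, G = (y1 + g + 1) >> 1, B = (y1 + b + 1) >> 1;
    rgba[0] = R < 0 ? 0 : (R > 255 ? 255 : R);  // plain R,G,B,A order here;
    rgba[1] = G < 0 ? 0 : (G > 255 ? 255 : G);  // the .ifc blocks below select the
    rgba[2] = B < 0 ? 0 : (B > 255 ? 255 : B);  // actual argb/rgba/abgr/bgra layout
    rgba[3] = 255;
}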
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
load_args_\ifmt
        mov             w9, w1
1:
        mov             w8, w0                          // w8 = width
2:
        movi            v5.8h, #4, lsl #8               // 128 * (1<<3)
load_chroma_\ifmt
        sub             v18.8h, v18.8h, v5.8h           // U*(1<<3) - 128*(1<<3)
        sub             v19.8h, v19.8h, v5.8h           // V*(1<<3) - 128*(1<<3)
        sqdmulh         v20.8h, v19.8h, v1.h[0]         // V * v2r (R)
        sqdmulh         v22.8h, v18.8h, v1.h[1]         // U * u2g
        sqdmulh         v19.8h, v19.8h, v1.h[2]         // V * v2g
        add             v22.8h, v22.8h, v19.8h          // U * u2g + V * v2g (G)
        sqdmulh         v24.8h, v18.8h, v1.h[3]         // U * u2b (B)
        zip2            v21.8h, v20.8h, v20.8h          // R2
        zip1            v20.8h, v20.8h, v20.8h          // R1
        zip2            v23.8h, v22.8h, v22.8h          // G2
        zip1            v22.8h, v22.8h, v22.8h          // G1
        zip2            v25.8h, v24.8h, v24.8h          // B2
        zip1            v24.8h, v24.8h, v24.8h          // B1
        ld1             {v2.16b}, [x4], #16             // load luma
        ushll           v26.8h, v2.8b, #3               // Y1*(1<<3)
        ushll2          v27.8h, v2.16b, #3              // Y2*(1<<3)
        sub             v26.8h, v26.8h, v3.8h           // Y1*(1<<3) - y_offset
        sub             v27.8h, v27.8h, v3.8h           // Y2*(1<<3) - y_offset
        sqdmulh         v26.8h, v26.8h, v0.8h           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
        sqdmulh         v27.8h, v27.8h, v0.8h           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0
        compute_rgba    v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif
.ifc \ofmt,rgba // 0 1 2 3
        compute_rgba    v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif
.ifc \ofmt,abgr // 3 2 1 0
        compute_rgba    v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif
.ifc \ofmt,bgra // 2 1 0 3
        compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif
        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
        subs            w8, w8, #16                     // width -= 16
        b.gt            2b
        add             x2, x2, w3, sxtw                // dst += padding
        add             x4, x4, w5, sxtw                // srcY += paddingY
increment_\ifmt
        subs            w1, w1, #1                      // height -= 1
        b.gt            1b
        mov             w0, w9
        ret
endfunc
.endm
.macro declare_rgb_funcs ifmt
        declare_func    \ifmt, argb
        declare_func    \ifmt, rgba
        declare_func    \ifmt, abgr
        declare_func    \ifmt, bgra
.endm
declare_rgb_funcs nv12
