From a76b409dd030a461b0c5dd1ead8b22d1be560afd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 17 Oct 2023 14:27:17 +0300 Subject: [PATCH] aarch64: Reindent all assembly to 8/24 column indentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally uses a layered indentation style to visually show how different unrolled/interleaved phases fit together. Signed-off-by: Martin Storsjö --- libavcodec/aarch64/aacpsdsp_neon.S | 218 +-- libavcodec/aarch64/opusdsp_neon.S | 102 +- libswresample/aarch64/resample.S | 80 +- libswscale/aarch64/hscale.S | 2250 ++++++++++++++-------------- libswscale/aarch64/output.S | 330 ++-- libswscale/aarch64/yuv2rgb_neon.S | 220 +-- 6 files changed, 1600 insertions(+), 1600 deletions(-) diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S index 686c62eb2e..f8cb0b2959 100644 --- a/libavcodec/aarch64/aacpsdsp_neon.S +++ b/libavcodec/aarch64/aacpsdsp_neon.S @@ -19,130 +19,130 @@ #include "libavutil/aarch64/asm.S" function ff_ps_add_squares_neon, export=1 -1: ld1 {v0.4s,v1.4s}, [x1], #32 - fmul v0.4s, v0.4s, v0.4s - fmul v1.4s, v1.4s, v1.4s - faddp v2.4s, v0.4s, v1.4s - ld1 {v3.4s}, [x0] - fadd v3.4s, v3.4s, v2.4s - st1 {v3.4s}, [x0], #16 - subs w2, w2, #4 - b.gt 1b +1: ld1 {v0.4s,v1.4s}, [x1], #32 + fmul v0.4s, v0.4s, v0.4s + fmul v1.4s, v1.4s, v1.4s + faddp v2.4s, v0.4s, v1.4s + ld1 {v3.4s}, [x0] + fadd v3.4s, v3.4s, v2.4s + st1 {v3.4s}, [x0], #16 + subs w2, w2, #4 + b.gt 1b ret endfunc function ff_ps_mul_pair_single_neon, export=1 -1: ld1 {v0.4s,v1.4s}, [x1], #32 - ld1 {v2.4s}, [x2], #16 - zip1 v3.4s, v2.4s, v2.4s - zip2 v4.4s, v2.4s, v2.4s - fmul v0.4s, v0.4s, v3.4s - fmul v1.4s, v1.4s, v4.4s - st1 {v0.4s,v1.4s}, [x0], #32 - subs w3, w3, #4 - b.gt 1b +1: ld1 {v0.4s,v1.4s}, [x1], #32 + ld1 {v2.4s}, [x2], #16 + zip1 v3.4s, v2.4s, v2.4s + zip2 v4.4s, v2.4s, v2.4s + fmul v0.4s, v0.4s, v3.4s + fmul v1.4s, v1.4s, v4.4s + st1 {v0.4s,v1.4s}, [x0], #32 + subs w3, w3, #4 + b.gt 1b ret endfunc function ff_ps_stereo_interpolate_neon, export=1 - ld1 {v0.4s}, [x2] - ld1 {v1.4s}, [x3] - zip1 v4.4s, v0.4s, v0.4s - zip2 v5.4s, v0.4s, v0.4s - zip1 v6.4s, v1.4s, v1.4s - zip2 v7.4s, v1.4s, v1.4s -1: ld1 {v2.2s}, [x0] - ld1 {v3.2s}, [x1] - fadd v4.4s, v4.4s, v6.4s - fadd v5.4s, v5.4s, v7.4s - mov v2.d[1], v2.d[0] - mov v3.d[1], v3.d[0] - fmul v2.4s, v2.4s, v4.4s - fmla v2.4s, v3.4s, v5.4s - st1 {v2.d}[0], [x0], #8 - st1 {v2.d}[1], [x1], #8 - subs w4, w4, #1 - b.gt 1b + ld1 {v0.4s}, [x2] + ld1 {v1.4s}, [x3] + zip1 v4.4s, v0.4s, v0.4s + zip2 v5.4s, v0.4s, v0.4s + zip1 v6.4s, v1.4s, v1.4s + zip2 v7.4s, v1.4s, v1.4s +1: ld1 {v2.2s}, [x0] + ld1 {v3.2s}, [x1] + fadd v4.4s, v4.4s, v6.4s + fadd v5.4s, v5.4s, v7.4s + mov v2.d[1], v2.d[0] + mov v3.d[1], v3.d[0] + fmul v2.4s, v2.4s, v4.4s + fmla v2.4s, v3.4s, v5.4s + st1 {v2.d}[0], [x0], #8 + st1 {v2.d}[1], [x1], #8 + subs w4, w4, #1 + b.gt 1b ret endfunc function ff_ps_stereo_interpolate_ipdopd_neon, export=1 - ld1 {v0.4s,v1.4s}, [x2] - ld1 {v6.4s,v7.4s}, [x3] - fneg v2.4s, v1.4s - fneg v3.4s, v7.4s - zip1 v16.4s, v0.4s, v0.4s - zip2 v17.4s, v0.4s, v0.4s - zip1 v18.4s, v2.4s, v1.4s - zip2 v19.4s, v2.4s, v1.4s - zip1 v20.4s, v6.4s, v6.4s - zip2 v21.4s, v6.4s, v6.4s - zip1 v22.4s, v3.4s, v7.4s - zip2 v23.4s, v3.4s, v7.4s -1: ld1 {v2.2s}, [x0] - ld1 {v3.2s}, [x1] - fadd v16.4s, v16.4s, v20.4s - fadd v17.4s, v17.4s, v21.4s - mov v2.d[1], v2.d[0] - mov v3.d[1], v3.d[0] - fmul v4.4s, v2.4s, v16.4s - fmla v4.4s, v3.4s, v17.4s - fadd v18.4s, v18.4s, v22.4s - fadd v19.4s, v19.4s, v23.4s - ext v2.16b, v2.16b, v2.16b, #4 - ext v3.16b, v3.16b, v3.16b, #4 - fmla v4.4s, v2.4s, v18.4s - fmla v4.4s, v3.4s, v19.4s - st1 {v4.d}[0], [x0], #8 - st1 {v4.d}[1], [x1], #8 - subs w4, w4, #1 - b.gt 1b + ld1 {v0.4s,v1.4s}, [x2] + ld1 {v6.4s,v7.4s}, [x3] + fneg v2.4s, v1.4s + fneg v3.4s, v7.4s + zip1 v16.4s, v0.4s, v0.4s + zip2 v17.4s, v0.4s, v0.4s + zip1 v18.4s, v2.4s, v1.4s + zip2 v19.4s, v2.4s, v1.4s + zip1 v20.4s, v6.4s, v6.4s + zip2 v21.4s, v6.4s, v6.4s + zip1 v22.4s, v3.4s, v7.4s + zip2 v23.4s, v3.4s, v7.4s +1: ld1 {v2.2s}, [x0] + ld1 {v3.2s}, [x1] + fadd v16.4s, v16.4s, v20.4s + fadd v17.4s, v17.4s, v21.4s + mov v2.d[1], v2.d[0] + mov v3.d[1], v3.d[0] + fmul v4.4s, v2.4s, v16.4s + fmla v4.4s, v3.4s, v17.4s + fadd v18.4s, v18.4s, v22.4s + fadd v19.4s, v19.4s, v23.4s + ext v2.16b, v2.16b, v2.16b, #4 + ext v3.16b, v3.16b, v3.16b, #4 + fmla v4.4s, v2.4s, v18.4s + fmla v4.4s, v3.4s, v19.4s + st1 {v4.d}[0], [x0], #8 + st1 {v4.d}[1], [x1], #8 + subs w4, w4, #1 + b.gt 1b ret endfunc function ff_ps_hybrid_analysis_neon, export=1 - lsl x3, x3, #3 - ld2 {v0.4s,v1.4s}, [x1], #32 - ld2 {v2.2s,v3.2s}, [x1], #16 - ld1 {v24.2s}, [x1], #8 - ld2 {v4.2s,v5.2s}, [x1], #16 - ld2 {v6.4s,v7.4s}, [x1] - rev64 v6.4s, v6.4s - rev64 v7.4s, v7.4s - ext v6.16b, v6.16b, v6.16b, #8 - ext v7.16b, v7.16b, v7.16b, #8 - rev64 v4.2s, v4.2s - rev64 v5.2s, v5.2s - mov v2.d[1], v3.d[0] - mov v4.d[1], v5.d[0] - mov v5.d[1], v2.d[0] - mov v3.d[1], v4.d[0] - fadd v16.4s, v0.4s, v6.4s - fadd v17.4s, v1.4s, v7.4s - fsub v18.4s, v1.4s, v7.4s - fsub v19.4s, v0.4s, v6.4s - fadd v22.4s, v2.4s, v4.4s - fsub v23.4s, v5.4s, v3.4s - trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5} - trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7} -1: ld2 {v2.4s,v3.4s}, [x2], #32 - ld2 {v4.2s,v5.2s}, [x2], #16 - ld1 {v6.2s}, [x2], #8 - add x2, x2, #8 - mov v4.d[1], v5.d[0] - mov v6.s[1], v6.s[0] - fmul v6.2s, v6.2s, v24.2s - fmul v0.4s, v2.4s, v16.4s - fmul v1.4s, v2.4s, v17.4s - fmls v0.4s, v3.4s, v18.4s - fmla v1.4s, v3.4s, v19.4s - fmla v0.4s, v4.4s, v20.4s - fmla v1.4s, v4.4s, v21.4s - faddp v0.4s, v0.4s, v1.4s - faddp v0.4s, v0.4s, v0.4s - fadd v0.2s, v0.2s, v6.2s - st1 {v0.2s}, [x0], x3 - subs w4, w4, #1 - b.gt 1b + lsl x3, x3, #3 + ld2 {v0.4s,v1.4s}, [x1], #32 + ld2 {v2.2s,v3.2s}, [x1], #16 + ld1 {v24.2s}, [x1], #8 + ld2 {v4.2s,v5.2s}, [x1], #16 + ld2 {v6.4s,v7.4s}, [x1] + rev64 v6.4s, v6.4s + rev64 v7.4s, v7.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 + rev64 v4.2s, v4.2s + rev64 v5.2s, v5.2s + mov v2.d[1], v3.d[0] + mov v4.d[1], v5.d[0] + mov v5.d[1], v2.d[0] + mov v3.d[1], v4.d[0] + fadd v16.4s, v0.4s, v6.4s + fadd v17.4s, v1.4s, v7.4s + fsub v18.4s, v1.4s, v7.4s + fsub v19.4s, v0.4s, v6.4s + fadd v22.4s, v2.4s, v4.4s + fsub v23.4s, v5.4s, v3.4s + trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5} + trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7} +1: ld2 {v2.4s,v3.4s}, [x2], #32 + ld2 {v4.2s,v5.2s}, [x2], #16 + ld1 {v6.2s}, [x2], #8 + add x2, x2, #8 + mov v4.d[1], v5.d[0] + mov v6.s[1], v6.s[0] + fmul v6.2s, v6.2s, v24.2s + fmul v0.4s, v2.4s, v16.4s + fmul v1.4s, v2.4s, v17.4s + fmls v0.4s, v3.4s, v18.4s + fmla v1.4s, v3.4s, v19.4s + fmla v0.4s, v4.4s, v20.4s + fmla v1.4s, v4.4s, v21.4s + faddp v0.4s, v0.4s, v1.4s + faddp v0.4s, v0.4s, v0.4s + fadd v0.2s, v0.2s, v6.2s + st1 {v0.2s}, [x0], x3 + subs w4, w4, #1 + b.gt 1b ret endfunc diff --git a/libavcodec/aarch64/opusdsp_neon.S b/libavcodec/aarch64/opusdsp_neon.S index 1c88d7d123..e933151ab4 100644 --- a/libavcodec/aarch64/opusdsp_neon.S +++ b/libavcodec/aarch64/opusdsp_neon.S @@ -33,81 +33,81 @@ const tab_x2, align=4 endconst function ff_opus_deemphasis_neon, export=1 - movrel x4, tab_st - ld1 {v4.4s}, [x4] - movrel x4, tab_x0 - ld1 {v5.4s}, [x4] - movrel x4, tab_x1 - ld1 {v6.4s}, [x4] - movrel x4, tab_x2 - ld1 {v7.4s}, [x4] + movrel x4, tab_st + ld1 {v4.4s}, [x4] + movrel x4, tab_x0 + ld1 {v5.4s}, [x4] + movrel x4, tab_x1 + ld1 {v6.4s}, [x4] + movrel x4, tab_x2 + ld1 {v7.4s}, [x4] - fmul v0.4s, v4.4s, v0.s[0] + fmul v0.4s, v4.4s, v0.s[0] -1: ld1 {v1.4s, v2.4s}, [x1], #32 +1: ld1 {v1.4s, v2.4s}, [x1], #32 - fmla v0.4s, v5.4s, v1.s[0] - fmul v3.4s, v7.4s, v2.s[2] + fmla v0.4s, v5.4s, v1.s[0] + fmul v3.4s, v7.4s, v2.s[2] - fmla v0.4s, v6.4s, v1.s[1] - fmla v3.4s, v6.4s, v2.s[1] + fmla v0.4s, v6.4s, v1.s[1] + fmla v3.4s, v6.4s, v2.s[1] - fmla v0.4s, v7.4s, v1.s[2] - fmla v3.4s, v5.4s, v2.s[0] + fmla v0.4s, v7.4s, v1.s[2] + fmla v3.4s, v5.4s, v2.s[0] - fadd v1.4s, v1.4s, v0.4s - fadd v2.4s, v2.4s, v3.4s + fadd v1.4s, v1.4s, v0.4s + fadd v2.4s, v2.4s, v3.4s - fmla v2.4s, v4.4s, v1.s[3] + fmla v2.4s, v4.4s, v1.s[3] - st1 {v1.4s, v2.4s}, [x0], #32 - fmul v0.4s, v4.4s, v2.s[3] + st1 {v1.4s, v2.4s}, [x0], #32 + fmul v0.4s, v4.4s, v2.s[3] - subs w2, w2, #8 - b.gt 1b + subs w2, w2, #8 + b.gt 1b - mov s0, v2.s[3] + mov s0, v2.s[3] ret endfunc function ff_opus_postfilter_neon, export=1 - ld1 {v0.4s}, [x2] - dup v1.4s, v0.s[1] - dup v2.4s, v0.s[2] - dup v0.4s, v0.s[0] + ld1 {v0.4s}, [x2] + dup v1.4s, v0.s[1] + dup v2.4s, v0.s[2] + dup v0.4s, v0.s[0] - add w1, w1, #2 - sub x1, x0, x1, lsl #2 + add w1, w1, #2 + sub x1, x0, x1, lsl #2 - ld1 {v3.4s}, [x1] - fmul v3.4s, v3.4s, v2.4s + ld1 {v3.4s}, [x1] + fmul v3.4s, v3.4s, v2.4s -1: add x1, x1, #4 - ld1 {v4.4s}, [x1] - add x1, x1, #4 - ld1 {v5.4s}, [x1] - add x1, x1, #4 - ld1 {v6.4s}, [x1] - add x1, x1, #4 - ld1 {v7.4s}, [x1] +1: add x1, x1, #4 + ld1 {v4.4s}, [x1] + add x1, x1, #4 + ld1 {v5.4s}, [x1] + add x1, x1, #4 + ld1 {v6.4s}, [x1] + add x1, x1, #4 + ld1 {v7.4s}, [x1] - fmla v3.4s, v7.4s, v2.4s - fadd v6.4s, v6.4s, v4.4s + fmla v3.4s, v7.4s, v2.4s + fadd v6.4s, v6.4s, v4.4s - ld1 {v4.4s}, [x0] - fmla v4.4s, v5.4s, v0.4s + ld1 {v4.4s}, [x0] + fmla v4.4s, v5.4s, v0.4s - fmul v6.4s, v6.4s, v1.4s - fadd v6.4s, v6.4s, v3.4s + fmul v6.4s, v6.4s, v1.4s + fadd v6.4s, v6.4s, v3.4s - fadd v4.4s, v4.4s, v6.4s - fmul v3.4s, v7.4s, v2.4s + fadd v4.4s, v4.4s, v6.4s + fmul v3.4s, v7.4s, v2.4s - st1 {v4.4s}, [x0], #16 + st1 {v4.4s}, [x0], #16 - subs w3, w3, #4 - b.gt 1b + subs w3, w3, #4 + b.gt 1b ret endfunc diff --git a/libswresample/aarch64/resample.S b/libswresample/aarch64/resample.S index 114d1216fb..6d9eaaeb23 100644 --- a/libswresample/aarch64/resample.S +++ b/libswresample/aarch64/resample.S @@ -21,57 +21,57 @@ #include "libavutil/aarch64/asm.S" function ff_resample_common_apply_filter_x4_float_neon, export=1 - movi v0.4s, #0 // accumulator -1: ld1 {v1.4s}, [x1], #16 // src[0..3] - ld1 {v2.4s}, [x2], #16 // filter[0..3] - fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3] - subs w3, w3, #4 // filter_length -= 4 - b.gt 1b // loop until filter_length - faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - st1 {v0.s}[0], [x0], #4 // write accumulator + movi v0.4s, #0 // accumulator +1: ld1 {v1.4s}, [x1], #16 // src[0..3] + ld1 {v2.4s}, [x2], #16 // filter[0..3] + fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3] + subs w3, w3, #4 // filter_length -= 4 + b.gt 1b // loop until filter_length + faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + st1 {v0.s}[0], [x0], #4 // write accumulator ret endfunc function ff_resample_common_apply_filter_x8_float_neon, export=1 - movi v0.4s, #0 // accumulator -1: ld1 {v1.4s}, [x1], #16 // src[0..3] - ld1 {v2.4s}, [x2], #16 // filter[0..3] - ld1 {v3.4s}, [x1], #16 // src[4..7] - ld1 {v4.4s}, [x2], #16 // filter[4..7] - fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3] - fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7] - subs w3, w3, #8 // filter_length -= 8 - b.gt 1b // loop until filter_length - faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - st1 {v0.s}[0], [x0], #4 // write accumulator + movi v0.4s, #0 // accumulator +1: ld1 {v1.4s}, [x1], #16 // src[0..3] + ld1 {v2.4s}, [x2], #16 // filter[0..3] + ld1 {v3.4s}, [x1], #16 // src[4..7] + ld1 {v4.4s}, [x2], #16 // filter[4..7] + fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3] + fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7] + subs w3, w3, #8 // filter_length -= 8 + b.gt 1b // loop until filter_length + faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + st1 {v0.s}[0], [x0], #4 // write accumulator ret endfunc function ff_resample_common_apply_filter_x4_s16_neon, export=1 - movi v0.4s, #0 // accumulator -1: ld1 {v1.4h}, [x1], #8 // src[0..3] - ld1 {v2.4h}, [x2], #8 // filter[0..3] - smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3] - subs w3, w3, #4 // filter_length -= 4 - b.gt 1b // loop until filter_length - addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - st1 {v0.s}[0], [x0], #4 // write accumulator + movi v0.4s, #0 // accumulator +1: ld1 {v1.4h}, [x1], #8 // src[0..3] + ld1 {v2.4h}, [x2], #8 // filter[0..3] + smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3] + subs w3, w3, #4 // filter_length -= 4 + b.gt 1b // loop until filter_length + addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + st1 {v0.s}[0], [x0], #4 // write accumulator ret endfunc function ff_resample_common_apply_filter_x8_s16_neon, export=1 - movi v0.4s, #0 // accumulator -1: ld1 {v1.8h}, [x1], #16 // src[0..7] - ld1 {v2.8h}, [x2], #16 // filter[0..7] - smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3] - smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7] - subs w3, w3, #8 // filter_length -= 8 - b.gt 1b // loop until filter_length - addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values - st1 {v0.s}[0], [x0], #4 // write accumulator + movi v0.4s, #0 // accumulator +1: ld1 {v1.8h}, [x1], #16 // src[0..7] + ld1 {v2.8h}, [x2], #16 // filter[0..7] + smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3] + smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7] + subs w3, w3, #8 // filter_length -= 8 + b.gt 1b // loop until filter_length + addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values + st1 {v0.s}[0], [x0], #4 // write accumulator ret endfunc diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S index 3041d483fc..b49443c964 100644 --- a/libswscale/aarch64/hscale.S +++ b/libswscale/aarch64/hscale.S @@ -41,53 +41,53 @@ ;----------------------------------------------------------------------------- */ function ff_hscale8to15_X8_neon, export=1 - sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) -1: ldr w8, [x5], #4 // filterPos[idx] - ldr w0, [x5], #4 // filterPos[idx + 1] - ldr w11, [x5], #4 // filterPos[idx + 2] - ldr w9, [x5], #4 // filterPos[idx + 3] - mov x16, x4 // filter0 = filter - add x12, x16, x7 // filter1 = filter0 + filterSize*2 - add x13, x12, x7 // filter2 = filter1 + filterSize*2 - add x4, x13, x7 // filter3 = filter2 + filterSize*2 - movi v0.2d, #0 // val sum part 1 (for dst[0]) - movi v1.2d, #0 // val sum part 2 (for dst[1]) - movi v2.2d, #0 // val sum part 3 (for dst[2]) - movi v3.2d, #0 // val sum part 4 (for dst[3]) - add x17, x3, w8, uxtw // srcp + filterPos[0] - add x8, x3, w0, uxtw // srcp + filterPos[1] - add x0, x3, w11, uxtw // srcp + filterPos[2] - add x11, x3, w9, uxtw // srcp + filterPos[3] - mov w15, w6 // filterSize counter -2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 - ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}] - ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize - uxtl v4.8h, v4.8b // unpack part 1 to 16-bit - smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] - smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] - ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}] - ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v6.8h, v6.8b // unpack part 2 to 16-bit - smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - uxtl v16.8h, v16.8b // unpack part 3 to 16-bit - smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}] - smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize - subs w15, w15, #8 // j -= 8: processed 8/filterSize - uxtl v18.8h, v18.8b // unpack part 4 to 16-bit - smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] - smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] - b.gt 2b // inner loop if filterSize not consumed completely - addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding - addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding - addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding - subs w2, w2, #4 // dstW -= 4 - sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values - st1 {v0.4h}, [x1], #8 // write to destination part0123 - b.gt 1b // loop until end of line + sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) +1: ldr w8, [x5], #4 // filterPos[idx] + ldr w0, [x5], #4 // filterPos[idx + 1] + ldr w11, [x5], #4 // filterPos[idx + 2] + ldr w9, [x5], #4 // filterPos[idx + 3] + mov x16, x4 // filter0 = filter + add x12, x16, x7 // filter1 = filter0 + filterSize*2 + add x13, x12, x7 // filter2 = filter1 + filterSize*2 + add x4, x13, x7 // filter3 = filter2 + filterSize*2 + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) + add x17, x3, w8, uxtw // srcp + filterPos[0] + add x8, x3, w0, uxtw // srcp + filterPos[1] + add x0, x3, w11, uxtw // srcp + filterPos[2] + add x11, x3, w9, uxtw // srcp + filterPos[3] + mov w15, w6 // filterSize counter +2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 + ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + uxtl v4.8h, v4.8b // unpack part 1 to 16-bit + smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] + smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] + ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}] + ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v6.8h, v6.8b // unpack part 2 to 16-bit + smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + uxtl v16.8h, v16.8b // unpack part 3 to 16-bit + smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}] + smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize + subs w15, w15, #8 // j -= 8: processed 8/filterSize + uxtl v18.8h, v18.8b // unpack part 4 to 16-bit + smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] + smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + b.gt 2b // inner loop if filterSize not consumed completely + addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding + subs w2, w2, #4 // dstW -= 4 + sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values + st1 {v0.4h}, [x1], #8 // write to destination part0123 + b.gt 1b // loop until end of line ret endfunc @@ -103,98 +103,98 @@ function ff_hscale8to15_X4_neon, export=1 // This function for filter sizes that are 4 mod 8. In other words, anything that's 0 mod 4 but not // 0 mod 8. It also assumes that dstW is 0 mod 4. - lsl w7, w6, #1 // w7 = filterSize * 2 + lsl w7, w6, #1 // w7 = filterSize * 2 1: - ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1] - ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3] + ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1] + ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3] - movi v16.2d, #0 // initialize accumulator for idx + 0 - movi v17.2d, #0 // initialize accumulator for idx + 1 - movi v18.2d, #0 // initialize accumulator for idx + 2 - movi v19.2d, #0 // initialize accumulator for idx + 3 + movi v16.2d, #0 // initialize accumulator for idx + 0 + movi v17.2d, #0 // initialize accumulator for idx + 1 + movi v18.2d, #0 // initialize accumulator for idx + 2 + movi v19.2d, #0 // initialize accumulator for idx + 3 - mov x12, x4 // filter pointer for idx + 0 - add x13, x4, x7 // filter pointer for idx + 1 - add x8, x3, w8, uxtw // srcp + filterPos[idx + 0] - add x9, x3, w9, uxtw // srcp + filterPos[idx + 1] + mov x12, x4 // filter pointer for idx + 0 + add x13, x4, x7 // filter pointer for idx + 1 + add x8, x3, w8, uxtw // srcp + filterPos[idx + 0] + add x9, x3, w9, uxtw // srcp + filterPos[idx + 1] - add x14, x13, x7 // filter pointer for idx + 2 - add x10, x3, w10, uxtw // srcp + filterPos[idx + 2] - add x11, x3, w11, uxtw // srcp + filterPos[idx + 3] + add x14, x13, x7 // filter pointer for idx + 2 + add x10, x3, w10, uxtw // srcp + filterPos[idx + 2] + add x11, x3, w11, uxtw // srcp + filterPos[idx + 3] - mov w0, w6 // copy filterSize to a temp register, w0 - add x5, x5, #16 // advance the filterPos pointer - add x15, x14, x7 // filter pointer for idx + 3 - mov x16, xzr // temp register for offsetting filter pointers + mov w0, w6 // copy filterSize to a temp register, w0 + add x5, x5, #16 // advance the filterPos pointer + add x15, x14, x7 // filter pointer for idx + 3 + mov x16, xzr // temp register for offsetting filter pointers 2: // This section loops over 8-wide chunks of filter size - ldr d4, [x8], #8 // load 8 bytes from srcp for idx + 0 - ldr q0, [x12, x16] // load 8 values, 16 bytes from filter for idx + 0 + ldr d4, [x8], #8 // load 8 bytes from srcp for idx + 0 + ldr q0, [x12, x16] // load 8 values, 16 bytes from filter for idx + 0 - ldr d5, [x9], #8 // load 8 bytes from srcp for idx + 1 - ldr q1, [x13, x16] // load 8 values, 16 bytes from filter for idx + 1 + ldr d5, [x9], #8 // load 8 bytes from srcp for idx + 1 + ldr q1, [x13, x16] // load 8 values, 16 bytes from filter for idx + 1 - uxtl v4.8h, v4.8b // unsigned extend long for idx + 0 - uxtl v5.8h, v5.8b // unsigned extend long for idx + 1 + uxtl v4.8h, v4.8b // unsigned extend long for idx + 0 + uxtl v5.8h, v5.8b // unsigned extend long for idx + 1 - ldr d6, [x10], #8 // load 8 bytes from srcp for idx + 2 - ldr q2, [x14, x16] // load 8 values, 16 bytes from filter for idx + 2 + ldr d6, [x10], #8 // load 8 bytes from srcp for idx + 2 + ldr q2, [x14, x16] // load 8 values, 16 bytes from filter for idx + 2 - smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0 - smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1 + smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0 + smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1 - ldr d7, [x11], #8 // load 8 bytes from srcp for idx + 3 - ldr q3, [x15, x16] // load 8 values, 16 bytes from filter for idx + 3 + ldr d7, [x11], #8 // load 8 bytes from srcp for idx + 3 + ldr q3, [x15, x16] // load 8 values, 16 bytes from filter for idx + 3 - sub w0, w0, #8 // decrement the remaining filterSize counter - smlal2 v16.4s, v0.8h, v4.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0 - smlal2 v17.4s, v1.8h, v5.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1 - uxtl v6.8h, v6.8b // unsigned extend long for idx + 2 - uxtl v7.8h, v7.8b // unsigned extend long for idx + 3 - smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2 - smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3 + sub w0, w0, #8 // decrement the remaining filterSize counter + smlal2 v16.4s, v0.8h, v4.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0 + smlal2 v17.4s, v1.8h, v5.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1 + uxtl v6.8h, v6.8b // unsigned extend long for idx + 2 + uxtl v7.8h, v7.8b // unsigned extend long for idx + 3 + smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2 + smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3 - cmp w0, #8 // are there at least 8 more elements in filter to consume? - add x16, x16, #16 // advance the offsetting register for filter values + cmp w0, #8 // are there at least 8 more elements in filter to consume? + add x16, x16, #16 // advance the offsetting register for filter values - smlal2 v18.4s, v2.8h, v6.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2 - smlal2 v19.4s, v3.8h, v7.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3 + smlal2 v18.4s, v2.8h, v6.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2 + smlal2 v19.4s, v3.8h, v7.8h // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3 - b.ge 2b // branch back to inner loop + b.ge 2b // branch back to inner loop // complete the remaining 4 filter elements - sub x17, x7, #8 // calculate the offset of the filter pointer for the remaining 4 elements - - ldr s4, [x8] // load 4 bytes from srcp for idx + 0 - ldr d0, [x12, x17] // load 4 values, 8 bytes from filter for idx + 0 - ldr s5, [x9] // load 4 bytes from srcp for idx + 1 - ldr d1, [x13, x17] // load 4 values, 8 bytes from filter for idx + 1 - - uxtl v4.8h, v4.8b // unsigned extend long for idx + 0 - uxtl v5.8h, v5.8b // unsigned extend long for idx + 1 - - ldr s6, [x10] // load 4 bytes from srcp for idx + 2 - ldr d2, [x14, x17] // load 4 values, 8 bytes from filter for idx + 2 - smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0 - smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1 - ldr s7, [x11] // load 4 bytes from srcp for idx + 3 - ldr d3, [x15, x17] // load 4 values, 8 bytes from filter for idx + 3 - - uxtl v6.8h, v6.8b // unsigned extend long for idx + 2 - uxtl v7.8h, v7.8b // unsigned extend long for idx + 3 - addp v16.4s, v16.4s, v17.4s // horizontal pair adding for idx 0,1 - smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2 - smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3 - - addp v18.4s, v18.4s, v19.4s // horizontal pair adding for idx 2,3 - addp v16.4s, v16.4s, v18.4s // final horizontal pair adding producing one vector with results for idx = 0..3 - - subs w2, w2, #4 // dstW -= 4 - sqshrn v0.4h, v16.4s, #7 // shift and clip the 2x16-bit final values - st1 {v0.4h}, [x1], #8 // write to destination idx 0..3 - add x4, x4, x7, lsl #2 // filter += (filterSize*2) * 4 - b.gt 1b // loop until end of line + sub x17, x7, #8 // calculate the offset of the filter pointer for the remaining 4 elements + + ldr s4, [x8] // load 4 bytes from srcp for idx + 0 + ldr d0, [x12, x17] // load 4 values, 8 bytes from filter for idx + 0 + ldr s5, [x9] // load 4 bytes from srcp for idx + 1 + ldr d1, [x13, x17] // load 4 values, 8 bytes from filter for idx + 1 + + uxtl v4.8h, v4.8b // unsigned extend long for idx + 0 + uxtl v5.8h, v5.8b // unsigned extend long for idx + 1 + + ldr s6, [x10] // load 4 bytes from srcp for idx + 2 + ldr d2, [x14, x17] // load 4 values, 8 bytes from filter for idx + 2 + smlal v16.4s, v0.4h, v4.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0 + smlal v17.4s, v1.4h, v5.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1 + ldr s7, [x11] // load 4 bytes from srcp for idx + 3 + ldr d3, [x15, x17] // load 4 values, 8 bytes from filter for idx + 3 + + uxtl v6.8h, v6.8b // unsigned extend long for idx + 2 + uxtl v7.8h, v7.8b // unsigned extend long for idx + 3 + addp v16.4s, v16.4s, v17.4s // horizontal pair adding for idx 0,1 + smlal v18.4s, v2.4h, v6.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2 + smlal v19.4s, v3.4h, v7.4h // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3 + + addp v18.4s, v18.4s, v19.4s // horizontal pair adding for idx 2,3 + addp v16.4s, v16.4s, v18.4s // final horizontal pair adding producing one vector with results for idx = 0..3 + + subs w2, w2, #4 // dstW -= 4 + sqshrn v0.4h, v16.4s, #7 // shift and clip the 2x16-bit final values + st1 {v0.4h}, [x1], #8 // write to destination idx 0..3 + add x4, x4, x7, lsl #2 // filter += (filterSize*2) * 4 + b.gt 1b // loop until end of line ret endfunc @@ -219,132 +219,132 @@ function ff_hscale8to15_4_neon, export=1 // 3. Complete madd // 4. Complete remaining iterations when dstW % 8 != 0 - sub sp, sp, #32 // allocate 32 bytes on the stack - cmp w2, #16 // if dstW <16, skip to the last block used for wrapping up - b.lt 2f + sub sp, sp, #32 // allocate 32 bytes on the stack + cmp w2, #16 // if dstW <16, skip to the last block used for wrapping up + b.lt 2f // load 8 values from filterPos to be used as offsets into src - ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1] - ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3] - ldp w12, w13, [x5, #16] // filterPos[idx + 4], [idx + 5] - ldp w14, w15, [x5, #24] // filterPos[idx + 6], [idx + 7] - add x5, x5, #32 // advance filterPos + ldp w8, w9, [x5] // filterPos[idx + 0], [idx + 1] + ldp w10, w11, [x5, #8] // filterPos[idx + 2], [idx + 3] + ldp w12, w13, [x5, #16] // filterPos[idx + 4], [idx + 5] + ldp w14, w15, [x5, #24] // filterPos[idx + 6], [idx + 7] + add x5, x5, #32 // advance filterPos // gather random access data from src into contiguous memory - ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]][0..3] - ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]][0..3] - ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]][0..3] - ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]][0..3] - ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]][0..3] - ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]][0..3] - ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]][0..3] - ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]][0..3] - stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] } - stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] } - stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] } - stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } + ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]][0..3] + ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]][0..3] + ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]][0..3] + ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]][0..3] + ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]][0..3] + ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]][0..3] + ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]][0..3] + ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]][0..3] + stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] } + stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] } + stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] } + stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } 1: - ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers + ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers // load 8 values from filterPos to be used as offsets into src - ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration - ldp w10, w11, [x5, #8] // filterPos[idx + 2][0..3], [idx + 3][0..3], next iteration - ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration - ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration + ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration + ldp w10, w11, [x5, #8] // filterPos[idx + 2][0..3], [idx + 3][0..3], next iteration + ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration + ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration - movi v0.2d, #0 // Clear madd accumulator for idx 0..3 - movi v5.2d, #0 // Clear madd accumulator for idx 4..7 + movi v0.2d, #0 // Clear madd accumulator for idx 0..3 + movi v5.2d, #0 // Clear madd accumulator for idx 4..7 - ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 - add x5, x5, #32 // advance filterPos + add x5, x5, #32 // advance filterPos // interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy - uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit - uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit - ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]], next iteration - ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]], next iteration - uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit - uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit - ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]], next iteration - ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]], next iteration - - smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3 - smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3 - ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]], next iteration - ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]], next iteration - smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3 - smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3 - ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]], next iteration - ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]], next iteration - - smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7 - smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7 - stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] } - stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] } - smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7 - smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7 - stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] } - stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } - - sub w2, w2, #8 // dstW -= 8 - sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values - sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values - st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7] - cmp w2, #16 // continue on main loop if there are at least 16 iterations left - b.ge 1b + uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit + uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit + ldr w8, [x3, w8, uxtw] // src[filterPos[idx + 0]], next iteration + ldr w9, [x3, w9, uxtw] // src[filterPos[idx + 1]], next iteration + uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit + uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit + ldr w10, [x3, w10, uxtw] // src[filterPos[idx + 2]], next iteration + ldr w11, [x3, w11, uxtw] // src[filterPos[idx + 3]], next iteration + + smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3 + smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3 + ldr w12, [x3, w12, uxtw] // src[filterPos[idx + 4]], next iteration + ldr w13, [x3, w13, uxtw] // src[filterPos[idx + 5]], next iteration + smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3 + smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3 + ldr w14, [x3, w14, uxtw] // src[filterPos[idx + 6]], next iteration + ldr w15, [x3, w15, uxtw] // src[filterPos[idx + 7]], next iteration + + smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7 + smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7 + stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] } + stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] } + smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7 + smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7 + stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] } + stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } + + sub w2, w2, #8 // dstW -= 8 + sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values + sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values + st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7] + cmp w2, #16 // continue on main loop if there are at least 16 iterations left + b.ge 1b // last full iteration - ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] - ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 + ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7 - movi v0.2d, #0 // Clear madd accumulator for idx 0..3 - movi v5.2d, #0 // Clear madd accumulator for idx 4..7 + movi v0.2d, #0 // Clear madd accumulator for idx 0..3 + movi v5.2d, #0 // Clear madd accumulator for idx 4..7 - uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit - uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit - uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit - uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit + uxtl v16.8h, v16.8b // unsigned extend long, covert src data to 16-bit + uxtl v17.8h, v17.8b // unsigned extend long, covert src data to 16-bit + uxtl v18.8h, v18.8b // unsigned extend long, covert src data to 16-bit + uxtl v19.8h, v19.8b // unsigned extend long, covert src data to 16-bit - smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3 - smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3 - smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3 - smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3 + smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3 + smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3 + smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3 + smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3 - smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7 - smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7 - smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7 - smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7 + smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7 + smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7 + smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7 + smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7 - subs w2, w2, #8 // dstW -= 8 - sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values - sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values - st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7] + subs w2, w2, #8 // dstW -= 8 + sqshrn v0.4h, v0.4s, #7 // shift and clip the 2x16-bit final values + sqshrn v1.4h, v5.4s, #7 // shift and clip the 2x16-bit final values + st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7] - cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section + cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section - add sp, sp, #32 // clean up stack + add sp, sp, #32 // clean up stack ret // finish up when dstW % 8 != 0 or dstW < 16 2: // load src - ldr w8, [x5], #4 // filterPos[i] - add x9, x3, w8, uxtw // calculate the address for src load - ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3] + ldr w8, [x5], #4 // filterPos[i] + add x9, x3, w8, uxtw // calculate the address for src load + ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3] // load filter - ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3] + ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3] - uxtl v5.8h, v5.8b // unsigned exten long, convert src data to 16-bit - smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...] - addv s0, v0.4s // add up products of src and filter values - sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value - st1 {v0.h}[0], [x1], #2 // dst[i] = ... - sub w2, w2, #1 // dstW-- - cbnz w2, 2b + uxtl v5.8h, v5.8b // unsigned exten long, convert src data to 16-bit + smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...] + addv s0, v0.4s // add up products of src and filter values + sqshrn h0, s0, #7 // shift and clip the 2x16-bit final value + st1 {v0.h}[0], [x1], #2 // dst[i] = ... + sub w2, w2, #1 // dstW-- + cbnz w2, 2b - add sp, sp, #32 // clean up stack + add sp, sp, #32 // clean up stack ret endfunc @@ -357,187 +357,187 @@ function ff_hscale8to19_4_neon, export=1 // x5 const int32_t *filterPos // w6 int filterSize - movi v18.4s, #1 - movi v17.4s, #1 - shl v18.4s, v18.4s, #19 - sub v18.4s, v18.4s, v17.4s // max allowed value + movi v18.4s, #1 + movi v17.4s, #1 + shl v18.4s, v18.4s, #19 + sub v18.4s, v18.4s, v17.4s // max allowed value - cmp w2, #16 - b.lt 2f // move to last block + cmp w2, #16 + b.lt 2f // move to last block - ldp w8, w9, [x5] // filterPos[0], filterPos[1] - ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] - ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] - ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] - add x5, x5, #32 + ldp w8, w9, [x5] // filterPos[0], filterPos[1] + ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] + ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] + ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] + add x5, x5, #32 // load data from - ldr w8, [x3, w8, uxtw] - ldr w9, [x3, w9, uxtw] - ldr w10, [x3, w10, uxtw] - ldr w11, [x3, w11, uxtw] - ldr w12, [x3, w12, uxtw] - ldr w13, [x3, w13, uxtw] - ldr w14, [x3, w14, uxtw] - ldr w15, [x3, w15, uxtw] - - sub sp, sp, #32 - - stp w8, w9, [sp] - stp w10, w11, [sp, #8] - stp w12, w13, [sp, #16] - stp w14, w15, [sp, #24] + ldr w8, [x3, w8, uxtw] + ldr w9, [x3, w9, uxtw] + ldr w10, [x3, w10, uxtw] + ldr w11, [x3, w11, uxtw] + ldr w12, [x3, w12, uxtw] + ldr w13, [x3, w13, uxtw] + ldr w14, [x3, w14, uxtw] + ldr w15, [x3, w15, uxtw] + + sub sp, sp, #32 + + stp w8, w9, [sp] + stp w10, w11, [sp, #8] + stp w12, w13, [sp, #16] + stp w14, w15, [sp, #24] 1: - ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] + ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp] + ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] // load filterPositions into registers for next iteration - ldp w8, w9, [x5] // filterPos[0], filterPos[1] - ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] - ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] - ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] - add x5, x5, #32 - uxtl v0.8h, v0.8b - ldr w8, [x3, w8, uxtw] - smull v5.4s, v0.4h, v28.4h // multiply first column of src - ldr w9, [x3, w9, uxtw] - smull2 v6.4s, v0.8h, v28.8h - stp w8, w9, [sp] - - uxtl v1.8h, v1.8b - ldr w10, [x3, w10, uxtw] - smlal v5.4s, v1.4h, v29.4h // multiply second column of src - ldr w11, [x3, w11, uxtw] - smlal2 v6.4s, v1.8h, v29.8h - stp w10, w11, [sp, #8] - - uxtl v2.8h, v2.8b - ldr w12, [x3, w12, uxtw] - smlal v5.4s, v2.4h, v30.4h // multiply third column of src - ldr w13, [x3, w13, uxtw] - smlal2 v6.4s, v2.8h, v30.8h - stp w12, w13, [sp, #16] - - uxtl v3.8h, v3.8b - ldr w14, [x3, w14, uxtw] - smlal v5.4s, v3.4h, v31.4h // multiply fourth column of src - ldr w15, [x3, w15, uxtw] - smlal2 v6.4s, v3.8h, v31.8h - stp w14, w15, [sp, #24] - - sub w2, w2, #8 - sshr v5.4s, v5.4s, #3 - sshr v6.4s, v6.4s, #3 - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - - st1 {v5.4s, v6.4s}, [x1], #32 - cmp w2, #16 - b.ge 1b + ldp w8, w9, [x5] // filterPos[0], filterPos[1] + ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] + ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] + ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] + add x5, x5, #32 + uxtl v0.8h, v0.8b + ldr w8, [x3, w8, uxtw] + smull v5.4s, v0.4h, v28.4h // multiply first column of src + ldr w9, [x3, w9, uxtw] + smull2 v6.4s, v0.8h, v28.8h + stp w8, w9, [sp] + + uxtl v1.8h, v1.8b + ldr w10, [x3, w10, uxtw] + smlal v5.4s, v1.4h, v29.4h // multiply second column of src + ldr w11, [x3, w11, uxtw] + smlal2 v6.4s, v1.8h, v29.8h + stp w10, w11, [sp, #8] + + uxtl v2.8h, v2.8b + ldr w12, [x3, w12, uxtw] + smlal v5.4s, v2.4h, v30.4h // multiply third column of src + ldr w13, [x3, w13, uxtw] + smlal2 v6.4s, v2.8h, v30.8h + stp w12, w13, [sp, #16] + + uxtl v3.8h, v3.8b + ldr w14, [x3, w14, uxtw] + smlal v5.4s, v3.4h, v31.4h // multiply fourth column of src + ldr w15, [x3, w15, uxtw] + smlal2 v6.4s, v3.8h, v31.8h + stp w14, w15, [sp, #24] + + sub w2, w2, #8 + sshr v5.4s, v5.4s, #3 + sshr v6.4s, v6.4s, #3 + smin v5.4s, v5.4s, v18.4s + smin v6.4s, v6.4s, v18.4s + + st1 {v5.4s, v6.4s}, [x1], #32 + cmp w2, #16 + b.ge 1b // here we make last iteration, without updating the registers - ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] - - uxtl v0.8h, v0.8b - uxtl v1.8h, v1.8b - smull v5.4s, v0.4h, v28.4h - smull2 v6.4s, v0.8h, v28.8h - uxtl v2.8h, v2.8b - smlal v5.4s, v1.4h, v29.4h - smlal2 v6.4s, v1.8h, v29.8h - uxtl v3.8h, v3.8b - smlal v5.4s, v2.4h, v30.4h - smlal2 v6.4s, v2.8h, v30.8h - smlal v5.4s, v3.4h, v31.4h - smlal2 v6.4s, v3.8h, v31.8h - - sshr v5.4s, v5.4s, #3 - sshr v6.4s, v6.4s, #3 - - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - - sub w2, w2, #8 - st1 {v5.4s, v6.4s}, [x1], #32 - add sp, sp, #32 // restore stack - cbnz w2, 2f + ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp] + ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] + + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + smull v5.4s, v0.4h, v28.4h + smull2 v6.4s, v0.8h, v28.8h + uxtl v2.8h, v2.8b + smlal v5.4s, v1.4h, v29.4h + smlal2 v6.4s, v1.8h, v29.8h + uxtl v3.8h, v3.8b + smlal v5.4s, v2.4h, v30.4h + smlal2 v6.4s, v2.8h, v30.8h + smlal v5.4s, v3.4h, v31.4h + smlal2 v6.4s, v3.8h, v31.8h + + sshr v5.4s, v5.4s, #3 + sshr v6.4s, v6.4s, #3 + + smin v5.4s, v5.4s, v18.4s + smin v6.4s, v6.4s, v18.4s + + sub w2, w2, #8 + st1 {v5.4s, v6.4s}, [x1], #32 + add sp, sp, #32 // restore stack + cbnz w2, 2f ret 2: - ldr w8, [x5], #4 // load filterPos - add x9, x3, w8, uxtw // src + filterPos - ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single - ld1 {v31.4h}, [x4], #8 - uxtl v0.8h, v0.8b - smull v5.4s, v0.4h, v31.4h - saddlv d0, v5.4s - sqshrn s0, d0, #3 - smin v0.4s, v0.4s, v18.4s - st1 {v0.s}[0], [x1], #4 - sub w2, w2, #1 - cbnz w2, 2b // if iterations remain jump to beginning + ldr w8, [x5], #4 // load filterPos + add x9, x3, w8, uxtw // src + filterPos + ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single + ld1 {v31.4h}, [x4], #8 + uxtl v0.8h, v0.8b + smull v5.4s, v0.4h, v31.4h + saddlv d0, v5.4s + sqshrn s0, d0, #3 + smin v0.4s, v0.4s, v18.4s + st1 {v0.s}[0], [x1], #4 + sub w2, w2, #1 + cbnz w2, 2b // if iterations remain jump to beginning ret endfunc function ff_hscale8to19_X8_neon, export=1 - movi v20.4s, #1 - movi v17.4s, #1 - shl v20.4s, v20.4s, #19 - sub v20.4s, v20.4s, v17.4s + movi v20.4s, #1 + movi v17.4s, #1 + shl v20.4s, v20.4s, #19 + sub v20.4s, v20.4s, v17.4s - sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) + sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) 1: - mov x16, x4 // filter0 = filter - ldr w8, [x5], #4 // filterPos[idx] - add x12, x16, x7 // filter1 = filter0 + filterSize*2 - ldr w0, [x5], #4 // filterPos[idx + 1] - add x13, x12, x7 // filter2 = filter1 + filterSize*2 - ldr w11, [x5], #4 // filterPos[idx + 2] - add x4, x13, x7 // filter3 = filter2 + filterSize*2 - ldr w9, [x5], #4 // filterPos[idx + 3] - movi v0.2d, #0 // val sum part 1 (for dst[0]) - movi v1.2d, #0 // val sum part 2 (for dst[1]) - movi v2.2d, #0 // val sum part 3 (for dst[2]) - movi v3.2d, #0 // val sum part 4 (for dst[3]) - add x17, x3, w8, uxtw // srcp + filterPos[0] - add x8, x3, w0, uxtw // srcp + filterPos[1] - add x0, x3, w11, uxtw // srcp + filterPos[2] - add x11, x3, w9, uxtw // srcp + filterPos[3] - mov w15, w6 // filterSize counter -2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 - uxtl v4.8h, v4.8b // unpack part 1 to 16-bit - smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] - ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}] - smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] - ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize - ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}] - uxtl v6.8h, v6.8b // unpack part 2 to 16-bit - ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v16.8h, v16.8b // unpack part 3 to 16-bit - smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}] - smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize - smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - uxtl v18.8h, v18.8b // unpack part 4 to 16-bit - smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] - subs w15, w15, #8 // j -= 8: processed 8/filterSize - smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] - b.gt 2b // inner loop if filterSize not consumed completely - addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding - addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding - addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding - subs w2, w2, #4 // dstW -= 4 - sshr v0.4s, v0.4s, #3 // shift and clip the 2x16-bit final values - smin v0.4s, v0.4s, v20.4s - st1 {v0.4s}, [x1], #16 // write to destination part0123 - b.gt 1b // loop until end of line + mov x16, x4 // filter0 = filter + ldr w8, [x5], #4 // filterPos[idx] + add x12, x16, x7 // filter1 = filter0 + filterSize*2 + ldr w0, [x5], #4 // filterPos[idx + 1] + add x13, x12, x7 // filter2 = filter1 + filterSize*2 + ldr w11, [x5], #4 // filterPos[idx + 2] + add x4, x13, x7 // filter3 = filter2 + filterSize*2 + ldr w9, [x5], #4 // filterPos[idx + 3] + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) + add x17, x3, w8, uxtw // srcp + filterPos[0] + add x8, x3, w0, uxtw // srcp + filterPos[1] + add x0, x3, w11, uxtw // srcp + filterPos[2] + add x11, x3, w9, uxtw // srcp + filterPos[3] + mov w15, w6 // filterSize counter +2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 + uxtl v4.8h, v4.8b // unpack part 1 to 16-bit + smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] + ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}] + smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}] + uxtl v6.8h, v6.8b // unpack part 2 to 16-bit + ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v16.8h, v16.8b // unpack part 3 to 16-bit + smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}] + smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize + smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + uxtl v18.8h, v18.8b // unpack part 4 to 16-bit + smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] + subs w15, w15, #8 // j -= 8: processed 8/filterSize + smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + b.gt 2b // inner loop if filterSize not consumed completely + addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding + subs w2, w2, #4 // dstW -= 4 + sshr v0.4s, v0.4s, #3 // shift and clip the 2x16-bit final values + smin v0.4s, v0.4s, v20.4s + st1 {v0.4s}, [x1], #16 // write to destination part0123 + b.gt 1b // loop until end of line ret endfunc @@ -550,91 +550,91 @@ function ff_hscale8to19_X4_neon, export=1 // x5 const int32_t *filterPos // w6 int filterSize - movi v20.4s, #1 - movi v17.4s, #1 - shl v20.4s, v20.4s, #19 - sub v20.4s, v20.4s, v17.4s + movi v20.4s, #1 + movi v17.4s, #1 + shl v20.4s, v20.4s, #19 + sub v20.4s, v20.4s, v17.4s - lsl w7, w6, #1 + lsl w7, w6, #1 1: - ldp w8, w9, [x5] - ldp w10, w11, [x5, #8] - - movi v16.2d, #0 // initialize accumulator for idx + 0 - movi v17.2d, #0 // initialize accumulator for idx + 1 - movi v18.2d, #0 // initialize accumulator for idx + 2 - movi v19.2d, #0 // initialize accumulator for idx + 3 - - mov x12, x4 // filter + 0 - add x13, x4, x7 // filter + 1 - add x8, x3, w8, uxtw // srcp + filterPos 0 - add x14, x13, x7 // filter + 2 - add x9, x3, w9, uxtw // srcp + filterPos 1 - add x15, x14, x7 // filter + 3 - add x10, x3, w10, uxtw // srcp + filterPos 2 - mov w0, w6 // save the filterSize to temporary variable - add x11, x3, w11, uxtw // srcp + filterPos 3 - add x5, x5, #16 // advance filter position - mov x16, xzr // clear the register x16 used for offsetting the filter values + ldp w8, w9, [x5] + ldp w10, w11, [x5, #8] + + movi v16.2d, #0 // initialize accumulator for idx + 0 + movi v17.2d, #0 // initialize accumulator for idx + 1 + movi v18.2d, #0 // initialize accumulator for idx + 2 + movi v19.2d, #0 // initialize accumulator for idx + 3 + + mov x12, x4 // filter + 0 + add x13, x4, x7 // filter + 1 + add x8, x3, w8, uxtw // srcp + filterPos 0 + add x14, x13, x7 // filter + 2 + add x9, x3, w9, uxtw // srcp + filterPos 1 + add x15, x14, x7 // filter + 3 + add x10, x3, w10, uxtw // srcp + filterPos 2 + mov w0, w6 // save the filterSize to temporary variable + add x11, x3, w11, uxtw // srcp + filterPos 3 + add x5, x5, #16 // advance filter position + mov x16, xzr // clear the register x16 used for offsetting the filter values 2: - ldr d4, [x8], #8 // load src values for idx 0 - ldr q31, [x12, x16] // load filter values for idx 0 - uxtl v4.8h, v4.8b // extend type to match the filter' size - ldr d5, [x9], #8 // load src values for idx 1 - smlal v16.4s, v4.4h, v31.4h // multiplication of lower half for idx 0 - uxtl v5.8h, v5.8b // extend type to match the filter' size - ldr q30, [x13, x16] // load filter values for idx 1 - smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0 - ldr d6, [x10], #8 // load src values for idx 2 - ldr q29, [x14, x16] // load filter values for idx 2 - smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1 - ldr d7, [x11], #8 // load src values for idx 3 - smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1 - uxtl v6.8h, v6.8b // extend tpye to matchi the filter's size - ldr q28, [x15, x16] // load filter values for idx 3 - smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2 - uxtl v7.8h, v7.8b - smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2 - sub w0, w0, #8 - smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3 - cmp w0, #8 - smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3 - add x16, x16, #16 // advance filter values indexing - - b.ge 2b + ldr d4, [x8], #8 // load src values for idx 0 + ldr q31, [x12, x16] // load filter values for idx 0 + uxtl v4.8h, v4.8b // extend type to match the filter' size + ldr d5, [x9], #8 // load src values for idx 1 + smlal v16.4s, v4.4h, v31.4h // multiplication of lower half for idx 0 + uxtl v5.8h, v5.8b // extend type to match the filter' size + ldr q30, [x13, x16] // load filter values for idx 1 + smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0 + ldr d6, [x10], #8 // load src values for idx 2 + ldr q29, [x14, x16] // load filter values for idx 2 + smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1 + ldr d7, [x11], #8 // load src values for idx 3 + smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1 + uxtl v6.8h, v6.8b // extend tpye to matchi the filter's size + ldr q28, [x15, x16] // load filter values for idx 3 + smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2 + uxtl v7.8h, v7.8b + smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2 + sub w0, w0, #8 + smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3 + cmp w0, #8 + smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3 + add x16, x16, #16 // advance filter values indexing + + b.ge 2b // 4 iterations left - sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements - - ldr s4, [x8] // load src values for idx 0 - ldr d31, [x12, x17] // load filter values for idx 0 - uxtl v4.8h, v4.8b // extend type to match the filter' size - ldr s5, [x9] // load src values for idx 1 - smlal v16.4s, v4.4h, v31.4h - ldr d30, [x13, x17] // load filter values for idx 1 - uxtl v5.8h, v5.8b // extend type to match the filter' size - ldr s6, [x10] // load src values for idx 2 - smlal v17.4s, v5.4h, v30.4h - uxtl v6.8h, v6.8b // extend type to match the filter's size - ldr d29, [x14, x17] // load filter values for idx 2 - ldr s7, [x11] // load src values for idx 3 - addp v16.4s, v16.4s, v17.4s - uxtl v7.8h, v7.8b - ldr d28, [x15, x17] // load filter values for idx 3 - smlal v18.4s, v6.4h, v29.4h - smlal v19.4s, v7.4h, v28.4h - subs w2, w2, #4 - addp v18.4s, v18.4s, v19.4s - addp v16.4s, v16.4s, v18.4s - sshr v16.4s, v16.4s, #3 - smin v16.4s, v16.4s, v20.4s - - st1 {v16.4s}, [x1], #16 - add x4, x4, x7, lsl #2 - b.gt 1b + sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements + + ldr s4, [x8] // load src values for idx 0 + ldr d31, [x12, x17] // load filter values for idx 0 + uxtl v4.8h, v4.8b // extend type to match the filter' size + ldr s5, [x9] // load src values for idx 1 + smlal v16.4s, v4.4h, v31.4h + ldr d30, [x13, x17] // load filter values for idx 1 + uxtl v5.8h, v5.8b // extend type to match the filter' size + ldr s6, [x10] // load src values for idx 2 + smlal v17.4s, v5.4h, v30.4h + uxtl v6.8h, v6.8b // extend type to match the filter's size + ldr d29, [x14, x17] // load filter values for idx 2 + ldr s7, [x11] // load src values for idx 3 + addp v16.4s, v16.4s, v17.4s + uxtl v7.8h, v7.8b + ldr d28, [x15, x17] // load filter values for idx 3 + smlal v18.4s, v6.4h, v29.4h + smlal v19.4s, v7.4h, v28.4h + subs w2, w2, #4 + addp v18.4s, v18.4s, v19.4s + addp v16.4s, v16.4s, v18.4s + sshr v16.4s, v16.4s, #3 + smin v16.4s, v16.4s, v20.4s + + st1 {v16.4s}, [x1], #16 + add x4, x4, x7, lsl #2 + b.gt 1b ret endfunc @@ -647,191 +647,191 @@ function ff_hscale16to15_4_neon_asm, export=1 // x5 const int32_t *filterPos // w6 int filterSize - movi v18.4s, #1 - movi v17.4s, #1 - shl v18.4s, v18.4s, #15 - sub v18.4s, v18.4s, v17.4s // max allowed value - dup v17.4s, w0 // read shift - neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) + movi v18.4s, #1 + movi v17.4s, #1 + shl v18.4s, v18.4s, #15 + sub v18.4s, v18.4s, v17.4s // max allowed value + dup v17.4s, w0 // read shift + neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) - cmp w2, #16 - b.lt 2f // move to last block + cmp w2, #16 + b.lt 2f // move to last block - ldp w8, w9, [x5] // filterPos[0], filterPos[1] - ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] - ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] - ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] - add x5, x5, #32 + ldp w8, w9, [x5] // filterPos[0], filterPos[1] + ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] + ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] + ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] + add x5, x5, #32 // shift all filterPos left by one, as uint16_t will be read - lsl x8, x8, #1 - lsl x9, x9, #1 - lsl x10, x10, #1 - lsl x11, x11, #1 - lsl x12, x12, #1 - lsl x13, x13, #1 - lsl x14, x14, #1 - lsl x15, x15, #1 + lsl x8, x8, #1 + lsl x9, x9, #1 + lsl x10, x10, #1 + lsl x11, x11, #1 + lsl x12, x12, #1 + lsl x13, x13, #1 + lsl x14, x14, #1 + lsl x15, x15, #1 // load src with given offset - ldr x8, [x3, w8, uxtw] - ldr x9, [x3, w9, uxtw] - ldr x10, [x3, w10, uxtw] - ldr x11, [x3, w11, uxtw] - ldr x12, [x3, w12, uxtw] - ldr x13, [x3, w13, uxtw] - ldr x14, [x3, w14, uxtw] - ldr x15, [x3, w15, uxtw] - - sub sp, sp, #64 + ldr x8, [x3, w8, uxtw] + ldr x9, [x3, w9, uxtw] + ldr x10, [x3, w10, uxtw] + ldr x11, [x3, w11, uxtw] + ldr x12, [x3, w12, uxtw] + ldr x13, [x3, w13, uxtw] + ldr x14, [x3, w14, uxtw] + ldr x15, [x3, w15, uxtw] + + sub sp, sp, #64 // push src on stack so it can be loaded into vectors later - stp x8, x9, [sp] - stp x10, x11, [sp, #16] - stp x12, x13, [sp, #32] - stp x14, x15, [sp, #48] + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + stp x12, x13, [sp, #32] + stp x14, x15, [sp, #48] 1: - ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] + ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] // Each of blocks does the following: // Extend src and filter to 32 bits with uxtl and sxtl // multiply or multiply and accumulate results // Extending to 32 bits is necessary, as unit16_t values can't // be represented as int16_t without type promotion. - uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4h - uxtl2 v0.4s, v0.8h - mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8h - uxtl v26.4s, v1.4h - mul v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v29.4h - uxtl2 v0.4s, v1.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v29.8h - uxtl v26.4s, v2.4h - mla v6.4s, v28.4s, v0.4s - - sxtl v27.4s, v30.4h - uxtl2 v0.4s, v2.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v30.8h - uxtl v26.4s, v3.4h - mla v6.4s, v28.4s, v0.4s - - sxtl v27.4s, v31.4h - uxtl2 v0.4s, v3.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v31.8h - sub w2, w2, #8 - mla v6.4s, v28.4s, v0.4s - - sshl v5.4s, v5.4s, v17.4s - sshl v6.4s, v6.4s, v17.4s - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - xtn v5.4h, v5.4s - xtn2 v5.8h, v6.4s - - st1 {v5.8h}, [x1], #16 - cmp w2, #16 + uxtl v26.4s, v0.4h + sxtl v27.4s, v28.4h + uxtl2 v0.4s, v0.8h + mul v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v28.8h + uxtl v26.4s, v1.4h + mul v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v29.4h + uxtl2 v0.4s, v1.8h + mla v5.4s, v27.4s, v26.4s + sxtl2 v28.4s, v29.8h + uxtl v26.4s, v2.4h + mla v6.4s, v28.4s, v0.4s + + sxtl v27.4s, v30.4h + uxtl2 v0.4s, v2.8h + mla v5.4s, v27.4s, v26.4s + sxtl2 v28.4s, v30.8h + uxtl v26.4s, v3.4h + mla v6.4s, v28.4s, v0.4s + + sxtl v27.4s, v31.4h + uxtl2 v0.4s, v3.8h + mla v5.4s, v27.4s, v26.4s + sxtl2 v28.4s, v31.8h + sub w2, w2, #8 + mla v6.4s, v28.4s, v0.4s + + sshl v5.4s, v5.4s, v17.4s + sshl v6.4s, v6.4s, v17.4s + smin v5.4s, v5.4s, v18.4s + smin v6.4s, v6.4s, v18.4s + xtn v5.4h, v5.4s + xtn2 v5.8h, v6.4s + + st1 {v5.8h}, [x1], #16 + cmp w2, #16 // load filterPositions into registers for next iteration - ldp w8, w9, [x5] // filterPos[0], filterPos[1] - ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] - ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] - ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] - add x5, x5, #32 - - lsl x8, x8, #1 - lsl x9, x9, #1 - lsl x10, x10, #1 - lsl x11, x11, #1 - lsl x12, x12, #1 - lsl x13, x13, #1 - lsl x14, x14, #1 - lsl x15, x15, #1 - - ldr x8, [x3, w8, uxtw] - ldr x9, [x3, w9, uxtw] - ldr x10, [x3, w10, uxtw] - ldr x11, [x3, w11, uxtw] - ldr x12, [x3, w12, uxtw] - ldr x13, [x3, w13, uxtw] - ldr x14, [x3, w14, uxtw] - ldr x15, [x3, w15, uxtw] - - stp x8, x9, [sp] - stp x10, x11, [sp, #16] - stp x12, x13, [sp, #32] - stp x14, x15, [sp, #48] - - b.ge 1b + ldp w8, w9, [x5] // filterPos[0], filterPos[1] + ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] + ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] + ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] + add x5, x5, #32 + + lsl x8, x8, #1 + lsl x9, x9, #1 + lsl x10, x10, #1 + lsl x11, x11, #1 + lsl x12, x12, #1 + lsl x13, x13, #1 + lsl x14, x14, #1 + lsl x15, x15, #1 + + ldr x8, [x3, w8, uxtw] + ldr x9, [x3, w9, uxtw] + ldr x10, [x3, w10, uxtw] + ldr x11, [x3, w11, uxtw] + ldr x12, [x3, w12, uxtw] + ldr x13, [x3, w13, uxtw] + ldr x14, [x3, w14, uxtw] + ldr x15, [x3, w15, uxtw] + + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + stp x12, x13, [sp, #32] + stp x14, x15, [sp, #48] + + b.ge 1b // here we make last iteration, without updating the registers - ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 - - uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4h - uxtl2 v0.4s, v0.8h - mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8h - uxtl v26.4s, v1.4h - mul v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v29.4h - uxtl2 v0.4s, v1.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v29.8h - uxtl v26.4s, v2.4h - mla v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v30.4h - uxtl2 v0.4s, v2.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v30.8h - uxtl v26.4s, v3.4h - mla v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v31.4h - uxtl2 v0.4s, v3.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v31.8h - subs w2, w2, #8 - mla v6.4s, v0.4s, v28.4s - - sshl v5.4s, v5.4s, v17.4s - sshl v6.4s, v6.4s, v17.4s - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - xtn v5.4h, v5.4s - xtn2 v5.8h, v6.4s - - st1 {v5.8h}, [x1], #16 - add sp, sp, #64 // restore stack - cbnz w2, 2f + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] + ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 + + uxtl v26.4s, v0.4h + sxtl v27.4s, v28.4h + uxtl2 v0.4s, v0.8h + mul v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v28.8h + uxtl v26.4s, v1.4h + mul v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v29.4h + uxtl2 v0.4s, v1.8h + mla v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v29.8h + uxtl v26.4s, v2.4h + mla v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v30.4h + uxtl2 v0.4s, v2.8h + mla v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v30.8h + uxtl v26.4s, v3.4h + mla v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v31.4h + uxtl2 v0.4s, v3.8h + mla v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v31.8h + subs w2, w2, #8 + mla v6.4s, v0.4s, v28.4s + + sshl v5.4s, v5.4s, v17.4s + sshl v6.4s, v6.4s, v17.4s + smin v5.4s, v5.4s, v18.4s + smin v6.4s, v6.4s, v18.4s + xtn v5.4h, v5.4s + xtn2 v5.8h, v6.4s + + st1 {v5.8h}, [x1], #16 + add sp, sp, #64 // restore stack + cbnz w2, 2f ret 2: - ldr w8, [x5], #4 // load filterPos - lsl w8, w8, #1 - add x9, x3, w8, uxtw // src + filterPos - ld1 {v0.4h}, [x9] // load 4 * uint16_t - ld1 {v31.4h}, [x4], #8 - - uxtl v0.4s, v0.4h - sxtl v31.4s, v31.4h - mul v5.4s, v0.4s, v31.4s - addv s0, v5.4s - sshl v0.4s, v0.4s, v17.4s - smin v0.4s, v0.4s, v18.4s - st1 {v0.h}[0], [x1], #2 - sub w2, w2, #1 - cbnz w2, 2b // if iterations remain jump to beginning + ldr w8, [x5], #4 // load filterPos + lsl w8, w8, #1 + add x9, x3, w8, uxtw // src + filterPos + ld1 {v0.4h}, [x9] // load 4 * uint16_t + ld1 {v31.4h}, [x4], #8 + + uxtl v0.4s, v0.4h + sxtl v31.4s, v31.4h + mul v5.4s, v0.4s, v31.4s + addv s0, v5.4s + sshl v0.4s, v0.4s, v17.4s + smin v0.4s, v0.4s, v18.4s + st1 {v0.h}[0], [x1], #2 + sub w2, w2, #1 + cbnz w2, 2b // if iterations remain jump to beginning ret endfunc @@ -845,79 +845,79 @@ function ff_hscale16to15_X8_neon_asm, export=1 // x5 const int32_t *filterPos // w6 int filterSize - movi v20.4s, #1 - movi v21.4s, #1 - shl v20.4s, v20.4s, #15 - sub v20.4s, v20.4s, v21.4s - dup v21.4s, w0 - neg v21.4s, v21.4s - - sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) -1: ldr w8, [x5], #4 // filterPos[idx] - lsl w8, w8, #1 - ldr w10, [x5], #4 // filterPos[idx + 1] - lsl w10, w10, #1 - ldr w11, [x5], #4 // filterPos[idx + 2] - lsl w11, w11, #1 - ldr w9, [x5], #4 // filterPos[idx + 3] - lsl w9, w9, #1 - mov x16, x4 // filter0 = filter - add x12, x16, x7 // filter1 = filter0 + filterSize*2 - add x13, x12, x7 // filter2 = filter1 + filterSize*2 - add x4, x13, x7 // filter3 = filter2 + filterSize*2 - movi v0.2d, #0 // val sum part 1 (for dst[0]) - movi v1.2d, #0 // val sum part 2 (for dst[1]) - movi v2.2d, #0 // val sum part 3 (for dst[2]) - movi v3.2d, #0 // val sum part 4 (for dst[3]) - add x17, x3, w8, uxtw // srcp + filterPos[0] - add x8, x3, w10, uxtw // srcp + filterPos[1] - add x10, x3, w11, uxtw // srcp + filterPos[2] - add x11, x3, w9, uxtw // srcp + filterPos[3] - mov w15, w6 // filterSize counter -2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 - ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}] - ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize - uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign - sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size - uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits - mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 - sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits - uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits - mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 - sxtl v27.4s, v7.4h // exted filter lower half - uxtl2 v6.4s, v6.8h // extend srcp upper half - sxtl2 v7.4s, v7.8h // extend filter upper half - ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}] - mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v22.4s, v16.4h // extend srcp lower half - sxtl v23.4s, v17.4h // extend filter lower half - uxtl2 v16.4s, v16.8h // extend srcp upper half - sxtl2 v17.4s, v17.8h // extend filter upper half - mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}] - mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize - subs w15, w15, #8 // j -= 8: processed 8/filterSize - uxtl v28.4s, v18.4h // extend srcp lower half - sxtl v29.4s, v19.4h // extend filter lower half - uxtl2 v18.4s, v18.8h // extend srcp upper half - sxtl2 v19.4s, v19.8h // extend filter upper half - mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] - mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] - b.gt 2b // inner loop if filterSize not consumed completely - addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding - addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding - addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding - subs w2, w2, #4 // dstW -= 4 - sshl v0.4s, v0.4s, v21.4s // shift right (effectively rigth, as shift is negative); overflow expected - smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) - xtn v0.4h, v0.4s // narrow down to 16 bits - - st1 {v0.4h}, [x1], #8 // write to destination part0123 - b.gt 1b // loop until end of line + movi v20.4s, #1 + movi v21.4s, #1 + shl v20.4s, v20.4s, #15 + sub v20.4s, v20.4s, v21.4s + dup v21.4s, w0 + neg v21.4s, v21.4s + + sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) +1: ldr w8, [x5], #4 // filterPos[idx] + lsl w8, w8, #1 + ldr w10, [x5], #4 // filterPos[idx + 1] + lsl w10, w10, #1 + ldr w11, [x5], #4 // filterPos[idx + 2] + lsl w11, w11, #1 + ldr w9, [x5], #4 // filterPos[idx + 3] + lsl w9, w9, #1 + mov x16, x4 // filter0 = filter + add x12, x16, x7 // filter1 = filter0 + filterSize*2 + add x13, x12, x7 // filter2 = filter1 + filterSize*2 + add x4, x13, x7 // filter3 = filter2 + filterSize*2 + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) + add x17, x3, w8, uxtw // srcp + filterPos[0] + add x8, x3, w10, uxtw // srcp + filterPos[1] + add x10, x3, w11, uxtw // srcp + filterPos[2] + add x11, x3, w9, uxtw // srcp + filterPos[3] + mov w15, w6 // filterSize counter +2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 + ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign + sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size + uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits + mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 + sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits + uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits + mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 + sxtl v27.4s, v7.4h // exted filter lower half + uxtl2 v6.4s, v6.8h // extend srcp upper half + sxtl2 v7.4s, v7.8h // extend filter upper half + ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}] + mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v22.4s, v16.4h // extend srcp lower half + sxtl v23.4s, v17.4h // extend filter lower half + uxtl2 v16.4s, v16.8h // extend srcp upper half + sxtl2 v17.4s, v17.8h // extend filter upper half + mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}] + mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize + subs w15, w15, #8 // j -= 8: processed 8/filterSize + uxtl v28.4s, v18.4h // extend srcp lower half + sxtl v29.4s, v19.4h // extend filter lower half + uxtl2 v18.4s, v18.8h // extend srcp upper half + sxtl2 v19.4s, v19.8h // extend filter upper half + mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] + mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + b.gt 2b // inner loop if filterSize not consumed completely + addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding + subs w2, w2, #4 // dstW -= 4 + sshl v0.4s, v0.4s, v21.4s // shift right (effectively rigth, as shift is negative); overflow expected + smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) + xtn v0.4h, v0.4s // narrow down to 16 bits + + st1 {v0.4h}, [x1], #8 // write to destination part0123 + b.gt 1b // loop until end of line ret endfunc @@ -930,118 +930,118 @@ function ff_hscale16to15_X4_neon_asm, export=1 // x5 const int32_t *filterPos // w6 int filterSize - stp d8, d9, [sp, #-0x20]! - stp d10, d11, [sp, #0x10] + stp d8, d9, [sp, #-0x20]! + stp d10, d11, [sp, #0x10] - movi v18.4s, #1 - movi v17.4s, #1 - shl v18.4s, v18.4s, #15 - sub v21.4s, v18.4s, v17.4s // max allowed value - dup v17.4s, w0 // read shift - neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) + movi v18.4s, #1 + movi v17.4s, #1 + shl v18.4s, v18.4s, #15 + sub v21.4s, v18.4s, v17.4s // max allowed value + dup v17.4s, w0 // read shift + neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) - lsl w7, w6, #1 + lsl w7, w6, #1 1: - ldp w8, w9, [x5] - ldp w10, w11, [x5, #8] - - movi v16.2d, #0 // initialize accumulator for idx + 0 - movi v17.2d, #0 // initialize accumulator for idx + 1 - movi v18.2d, #0 // initialize accumulator for idx + 2 - movi v19.2d, #0 // initialize accumulator for idx + 3 - - mov x12, x4 // filter + 0 - add x13, x4, x7 // filter + 1 - add x8, x3, x8, lsl #1 // srcp + filterPos 0 - add x14, x13, x7 // filter + 2 - add x9, x3, x9, lsl #1 // srcp + filterPos 1 - add x15, x14, x7 // filter + 3 - add x10, x3, x10, lsl #1 // srcp + filterPos 2 - mov w0, w6 // save the filterSize to temporary variable - add x11, x3, x11, lsl #1 // srcp + filterPos 3 - add x5, x5, #16 // advance filter position - mov x16, xzr // clear the register x16 used for offsetting the filter values + ldp w8, w9, [x5] + ldp w10, w11, [x5, #8] + + movi v16.2d, #0 // initialize accumulator for idx + 0 + movi v17.2d, #0 // initialize accumulator for idx + 1 + movi v18.2d, #0 // initialize accumulator for idx + 2 + movi v19.2d, #0 // initialize accumulator for idx + 3 + + mov x12, x4 // filter + 0 + add x13, x4, x7 // filter + 1 + add x8, x3, x8, lsl #1 // srcp + filterPos 0 + add x14, x13, x7 // filter + 2 + add x9, x3, x9, lsl #1 // srcp + filterPos 1 + add x15, x14, x7 // filter + 3 + add x10, x3, x10, lsl #1 // srcp + filterPos 2 + mov w0, w6 // save the filterSize to temporary variable + add x11, x3, x11, lsl #1 // srcp + filterPos 3 + add x5, x5, #16 // advance filter position + mov x16, xzr // clear the register x16 used for offsetting the filter values 2: - ldr q4, [x8], #16 // load src values for idx 0 - ldr q5, [x9], #16 // load src values for idx 1 - uxtl v26.4s, v4.4h - uxtl2 v4.4s, v4.8h - ldr q31, [x12, x16] // load filter values for idx 0 - ldr q6, [x10], #16 // load src values for idx 2 - sxtl v22.4s, v31.4h - sxtl2 v31.4s, v31.8h - mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0 - uxtl v25.4s, v5.4h - uxtl2 v5.4s, v5.8h - ldr q30, [x13, x16] // load filter values for idx 1 - ldr q7, [x11], #16 // load src values for idx 3 - mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 - uxtl v24.4s, v6.4h - sxtl v8.4s, v30.4h - sxtl2 v30.4s, v30.8h - mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1 - ldr q29, [x14, x16] // load filter values for idx 2 - uxtl2 v6.4s, v6.8h - sxtl v9.4s, v29.4h - sxtl2 v29.4s, v29.8h - mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 - mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2 - ldr q28, [x15, x16] // load filter values for idx 3 - uxtl v23.4s, v7.4h - sxtl v10.4s, v28.4h - mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 - uxtl2 v7.4s, v7.8h - sxtl2 v28.4s, v28.8h - mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3 - sub w0, w0, #8 - cmp w0, #8 - mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 - - add x16, x16, #16 // advance filter values indexing - - b.ge 2b + ldr q4, [x8], #16 // load src values for idx 0 + ldr q5, [x9], #16 // load src values for idx 1 + uxtl v26.4s, v4.4h + uxtl2 v4.4s, v4.8h + ldr q31, [x12, x16] // load filter values for idx 0 + ldr q6, [x10], #16 // load src values for idx 2 + sxtl v22.4s, v31.4h + sxtl2 v31.4s, v31.8h + mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0 + uxtl v25.4s, v5.4h + uxtl2 v5.4s, v5.8h + ldr q30, [x13, x16] // load filter values for idx 1 + ldr q7, [x11], #16 // load src values for idx 3 + mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 + uxtl v24.4s, v6.4h + sxtl v8.4s, v30.4h + sxtl2 v30.4s, v30.8h + mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1 + ldr q29, [x14, x16] // load filter values for idx 2 + uxtl2 v6.4s, v6.8h + sxtl v9.4s, v29.4h + sxtl2 v29.4s, v29.8h + mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 + mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2 + ldr q28, [x15, x16] // load filter values for idx 3 + uxtl v23.4s, v7.4h + sxtl v10.4s, v28.4h + mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 + uxtl2 v7.4s, v7.8h + sxtl2 v28.4s, v28.8h + mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3 + sub w0, w0, #8 + cmp w0, #8 + mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 + + add x16, x16, #16 // advance filter values indexing + + b.ge 2b // 4 iterations left - sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements - - ldr d4, [x8] // load src values for idx 0 - ldr d31, [x12, x17] // load filter values for idx 0 - uxtl v4.4s, v4.4h - sxtl v31.4s, v31.4h - ldr d5, [x9] // load src values for idx 1 - mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 - ldr d30, [x13, x17] // load filter values for idx 1 - uxtl v5.4s, v5.4h - sxtl v30.4s, v30.4h - ldr d6, [x10] // load src values for idx 2 - mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 - ldr d29, [x14, x17] // load filter values for idx 2 - uxtl v6.4s, v6.4h - sxtl v29.4s, v29.4h - ldr d7, [x11] // load src values for idx 3 - ldr d28, [x15, x17] // load filter values for idx 3 - mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 - uxtl v7.4s, v7.4h - sxtl v28.4s, v28.4h - addp v16.4s, v16.4s, v17.4s - mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 - subs w2, w2, #4 - addp v18.4s, v18.4s, v19.4s - addp v16.4s, v16.4s, v18.4s - sshl v16.4s, v16.4s, v20.4s - smin v16.4s, v16.4s, v21.4s - xtn v16.4h, v16.4s - - st1 {v16.4h}, [x1], #8 - add x4, x4, x7, lsl #2 - b.gt 1b - - ldp d8, d9, [sp] - ldp d10, d11, [sp, #0x10] - - add sp, sp, #0x20 + sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements + + ldr d4, [x8] // load src values for idx 0 + ldr d31, [x12, x17] // load filter values for idx 0 + uxtl v4.4s, v4.4h + sxtl v31.4s, v31.4h + ldr d5, [x9] // load src values for idx 1 + mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 + ldr d30, [x13, x17] // load filter values for idx 1 + uxtl v5.4s, v5.4h + sxtl v30.4s, v30.4h + ldr d6, [x10] // load src values for idx 2 + mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 + ldr d29, [x14, x17] // load filter values for idx 2 + uxtl v6.4s, v6.4h + sxtl v29.4s, v29.4h + ldr d7, [x11] // load src values for idx 3 + ldr d28, [x15, x17] // load filter values for idx 3 + mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 + uxtl v7.4s, v7.4h + sxtl v28.4s, v28.4h + addp v16.4s, v16.4s, v17.4s + mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 + subs w2, w2, #4 + addp v18.4s, v18.4s, v19.4s + addp v16.4s, v16.4s, v18.4s + sshl v16.4s, v16.4s, v20.4s + smin v16.4s, v16.4s, v21.4s + xtn v16.4h, v16.4s + + st1 {v16.4h}, [x1], #8 + add x4, x4, x7, lsl #2 + b.gt 1b + + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + + add sp, sp, #0x20 ret endfunc @@ -1055,188 +1055,188 @@ function ff_hscale16to19_4_neon_asm, export=1 // x5 const int32_t *filterPos // w6 int filterSize - movi v18.4s, #1 - movi v17.4s, #1 - shl v18.4s, v18.4s, #19 - sub v18.4s, v18.4s, v17.4s // max allowed value - dup v17.4s, w0 // read shift - neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) + movi v18.4s, #1 + movi v17.4s, #1 + shl v18.4s, v18.4s, #19 + sub v18.4s, v18.4s, v17.4s // max allowed value + dup v17.4s, w0 // read shift + neg v17.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) - cmp w2, #16 - b.lt 2f // move to last block + cmp w2, #16 + b.lt 2f // move to last block - ldp w8, w9, [x5] // filterPos[0], filterPos[1] - ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] - ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] - ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] - add x5, x5, #32 + ldp w8, w9, [x5] // filterPos[0], filterPos[1] + ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] + ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] + ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] + add x5, x5, #32 // shift all filterPos left by one, as uint16_t will be read - lsl x8, x8, #1 - lsl x9, x9, #1 - lsl x10, x10, #1 - lsl x11, x11, #1 - lsl x12, x12, #1 - lsl x13, x13, #1 - lsl x14, x14, #1 - lsl x15, x15, #1 + lsl x8, x8, #1 + lsl x9, x9, #1 + lsl x10, x10, #1 + lsl x11, x11, #1 + lsl x12, x12, #1 + lsl x13, x13, #1 + lsl x14, x14, #1 + lsl x15, x15, #1 // load src with given offset - ldr x8, [x3, w8, uxtw] - ldr x9, [x3, w9, uxtw] - ldr x10, [x3, w10, uxtw] - ldr x11, [x3, w11, uxtw] - ldr x12, [x3, w12, uxtw] - ldr x13, [x3, w13, uxtw] - ldr x14, [x3, w14, uxtw] - ldr x15, [x3, w15, uxtw] - - sub sp, sp, #64 + ldr x8, [x3, w8, uxtw] + ldr x9, [x3, w9, uxtw] + ldr x10, [x3, w10, uxtw] + ldr x11, [x3, w11, uxtw] + ldr x12, [x3, w12, uxtw] + ldr x13, [x3, w13, uxtw] + ldr x14, [x3, w14, uxtw] + ldr x15, [x3, w15, uxtw] + + sub sp, sp, #64 // push src on stack so it can be loaded into vectors later - stp x8, x9, [sp] - stp x10, x11, [sp, #16] - stp x12, x13, [sp, #32] - stp x14, x15, [sp, #48] + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + stp x12, x13, [sp, #32] + stp x14, x15, [sp, #48] 1: - ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] + ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] // Each of blocks does the following: // Extend src and filter to 32 bits with uxtl and sxtl // multiply or multiply and accumulate results // Extending to 32 bits is necessary, as unit16_t values can't // be represented as int16_t without type promotion. - uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4h - uxtl2 v0.4s, v0.8h - mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8h - uxtl v26.4s, v1.4h - mul v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v29.4h - uxtl2 v0.4s, v1.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v29.8h - uxtl v26.4s, v2.4h - mla v6.4s, v28.4s, v0.4s - - sxtl v27.4s, v30.4h - uxtl2 v0.4s, v2.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v30.8h - uxtl v26.4s, v3.4h - mla v6.4s, v28.4s, v0.4s - - sxtl v27.4s, v31.4h - uxtl2 v0.4s, v3.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v31.8h - sub w2, w2, #8 - mla v6.4s, v28.4s, v0.4s - - sshl v5.4s, v5.4s, v17.4s - sshl v6.4s, v6.4s, v17.4s - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - - st1 {v5.4s, v6.4s}, [x1], #32 - cmp w2, #16 + uxtl v26.4s, v0.4h + sxtl v27.4s, v28.4h + uxtl2 v0.4s, v0.8h + mul v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v28.8h + uxtl v26.4s, v1.4h + mul v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v29.4h + uxtl2 v0.4s, v1.8h + mla v5.4s, v27.4s, v26.4s + sxtl2 v28.4s, v29.8h + uxtl v26.4s, v2.4h + mla v6.4s, v28.4s, v0.4s + + sxtl v27.4s, v30.4h + uxtl2 v0.4s, v2.8h + mla v5.4s, v27.4s, v26.4s + sxtl2 v28.4s, v30.8h + uxtl v26.4s, v3.4h + mla v6.4s, v28.4s, v0.4s + + sxtl v27.4s, v31.4h + uxtl2 v0.4s, v3.8h + mla v5.4s, v27.4s, v26.4s + sxtl2 v28.4s, v31.8h + sub w2, w2, #8 + mla v6.4s, v28.4s, v0.4s + + sshl v5.4s, v5.4s, v17.4s + sshl v6.4s, v6.4s, v17.4s + smin v5.4s, v5.4s, v18.4s + smin v6.4s, v6.4s, v18.4s + + st1 {v5.4s, v6.4s}, [x1], #32 + cmp w2, #16 // load filterPositions into registers for next iteration - ldp w8, w9, [x5] // filterPos[0], filterPos[1] - ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] - ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] - ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] - add x5, x5, #32 - - lsl x8, x8, #1 - lsl x9, x9, #1 - lsl x10, x10, #1 - lsl x11, x11, #1 - lsl x12, x12, #1 - lsl x13, x13, #1 - lsl x14, x14, #1 - lsl x15, x15, #1 - - ldr x8, [x3, w8, uxtw] - ldr x9, [x3, w9, uxtw] - ldr x10, [x3, w10, uxtw] - ldr x11, [x3, w11, uxtw] - ldr x12, [x3, w12, uxtw] - ldr x13, [x3, w13, uxtw] - ldr x14, [x3, w14, uxtw] - ldr x15, [x3, w15, uxtw] - - stp x8, x9, [sp] - stp x10, x11, [sp, #16] - stp x12, x13, [sp, #32] - stp x14, x15, [sp, #48] - - b.ge 1b + ldp w8, w9, [x5] // filterPos[0], filterPos[1] + ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] + ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] + ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] + add x5, x5, #32 + + lsl x8, x8, #1 + lsl x9, x9, #1 + lsl x10, x10, #1 + lsl x11, x11, #1 + lsl x12, x12, #1 + lsl x13, x13, #1 + lsl x14, x14, #1 + lsl x15, x15, #1 + + ldr x8, [x3, w8, uxtw] + ldr x9, [x3, w9, uxtw] + ldr x10, [x3, w10, uxtw] + ldr x11, [x3, w11, uxtw] + ldr x12, [x3, w12, uxtw] + ldr x13, [x3, w13, uxtw] + ldr x14, [x3, w14, uxtw] + ldr x15, [x3, w15, uxtw] + + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + stp x12, x13, [sp, #32] + stp x14, x15, [sp, #48] + + b.ge 1b // here we make last iteration, without updating the registers - ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 - - uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4h - uxtl2 v0.4s, v0.8h - mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8h - uxtl v26.4s, v1.4h - mul v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v29.4h - uxtl2 v0.4s, v1.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v29.8h - uxtl v26.4s, v2.4h - mla v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v30.4h - uxtl2 v0.4s, v2.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v30.8h - uxtl v26.4s, v3.4h - mla v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v31.4h - uxtl2 v0.4s, v3.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v31.8h - subs w2, w2, #8 - mla v6.4s, v0.4s, v28.4s - - sshl v5.4s, v5.4s, v17.4s - sshl v6.4s, v6.4s, v17.4s - - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - - st1 {v5.4s, v6.4s}, [x1], #32 - add sp, sp, #64 // restore stack - cbnz w2, 2f + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] + ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 + + uxtl v26.4s, v0.4h + sxtl v27.4s, v28.4h + uxtl2 v0.4s, v0.8h + mul v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v28.8h + uxtl v26.4s, v1.4h + mul v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v29.4h + uxtl2 v0.4s, v1.8h + mla v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v29.8h + uxtl v26.4s, v2.4h + mla v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v30.4h + uxtl2 v0.4s, v2.8h + mla v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v30.8h + uxtl v26.4s, v3.4h + mla v6.4s, v0.4s, v28.4s + + sxtl v27.4s, v31.4h + uxtl2 v0.4s, v3.8h + mla v5.4s, v26.4s, v27.4s + sxtl2 v28.4s, v31.8h + subs w2, w2, #8 + mla v6.4s, v0.4s, v28.4s + + sshl v5.4s, v5.4s, v17.4s + sshl v6.4s, v6.4s, v17.4s + + smin v5.4s, v5.4s, v18.4s + smin v6.4s, v6.4s, v18.4s + + st1 {v5.4s, v6.4s}, [x1], #32 + add sp, sp, #64 // restore stack + cbnz w2, 2f ret 2: - ldr w8, [x5], #4 // load filterPos - lsl w8, w8, #1 - add x9, x3, w8, uxtw // src + filterPos - ld1 {v0.4h}, [x9] // load 4 * uint16_t - ld1 {v31.4h}, [x4], #8 - - uxtl v0.4s, v0.4h - sxtl v31.4s, v31.4h - subs w2, w2, #1 - mul v5.4s, v0.4s, v31.4s - addv s0, v5.4s - sshl v0.4s, v0.4s, v17.4s - smin v0.4s, v0.4s, v18.4s - st1 {v0.s}[0], [x1], #4 - cbnz w2, 2b // if iterations remain jump to beginning + ldr w8, [x5], #4 // load filterPos + lsl w8, w8, #1 + add x9, x3, w8, uxtw // src + filterPos + ld1 {v0.4h}, [x9] // load 4 * uint16_t + ld1 {v31.4h}, [x4], #8 + + uxtl v0.4s, v0.4h + sxtl v31.4s, v31.4h + subs w2, w2, #1 + mul v5.4s, v0.4s, v31.4s + addv s0, v5.4s + sshl v0.4s, v0.4s, v17.4s + smin v0.4s, v0.4s, v18.4s + st1 {v0.s}[0], [x1], #4 + cbnz w2, 2b // if iterations remain jump to beginning ret endfunc @@ -1250,77 +1250,77 @@ function ff_hscale16to19_X8_neon_asm, export=1 // x5 const int32_t *filterPos // w6 int filterSize - movi v20.4s, #1 - movi v21.4s, #1 - shl v20.4s, v20.4s, #19 - sub v20.4s, v20.4s, v21.4s - dup v21.4s, w0 - neg v21.4s, v21.4s - - sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) -1: ldr w8, [x5], #4 // filterPos[idx] - ldr w10, [x5], #4 // filterPos[idx + 1] - lsl w8, w8, #1 - ldr w11, [x5], #4 // filterPos[idx + 2] - ldr w9, [x5], #4 // filterPos[idx + 3] - mov x16, x4 // filter0 = filter - lsl w11, w11, #1 - add x12, x16, x7 // filter1 = filter0 + filterSize*2 - lsl w9, w9, #1 - add x13, x12, x7 // filter2 = filter1 + filterSize*2 - lsl w10, w10, #1 - add x4, x13, x7 // filter3 = filter2 + filterSize*2 - movi v0.2d, #0 // val sum part 1 (for dst[0]) - movi v1.2d, #0 // val sum part 2 (for dst[1]) - movi v2.2d, #0 // val sum part 3 (for dst[2]) - movi v3.2d, #0 // val sum part 4 (for dst[3]) - add x17, x3, w8, uxtw // srcp + filterPos[0] - add x8, x3, w10, uxtw // srcp + filterPos[1] - add x10, x3, w11, uxtw // srcp + filterPos[2] - add x11, x3, w9, uxtw // srcp + filterPos[3] - mov w15, w6 // filterSize counter -2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}] - ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 - ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}] - ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize - uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign - sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size - uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits - mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 - sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits - uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits - mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 - sxtl v27.4s, v7.4h // exted filter lower half - uxtl2 v6.4s, v6.8h // extend srcp upper half - sxtl2 v7.4s, v7.8h // extend filter upper half - ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}] - mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] - ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize - uxtl v22.4s, v16.4h // extend srcp lower half - sxtl v23.4s, v17.4h // extend filter lower half - uxtl2 v16.4s, v16.8h // extend srcp upper half - sxtl2 v17.4s, v17.8h // extend filter upper half - mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] - mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] - ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}] - mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] - ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize - subs w15, w15, #8 // j -= 8: processed 8/filterSize - uxtl v28.4s, v18.4h // extend srcp lower half - sxtl v29.4s, v19.4h // extend filter lower half - uxtl2 v18.4s, v18.8h // extend srcp upper half - sxtl2 v19.4s, v19.8h // extend filter upper half - mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] - mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] - b.gt 2b // inner loop if filterSize not consumed completely - addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding - addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding - addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding - subs w2, w2, #4 // dstW -= 4 - sshl v0.4s, v0.4s, v21.4s // shift right (effectively rigth, as shift is negative); overflow expected - smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) - st1 {v0.4s}, [x1], #16 // write to destination part0123 - b.gt 1b // loop until end of line + movi v20.4s, #1 + movi v21.4s, #1 + shl v20.4s, v20.4s, #19 + sub v20.4s, v20.4s, v21.4s + dup v21.4s, w0 + neg v21.4s, v21.4s + + sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) +1: ldr w8, [x5], #4 // filterPos[idx] + ldr w10, [x5], #4 // filterPos[idx + 1] + lsl w8, w8, #1 + ldr w11, [x5], #4 // filterPos[idx + 2] + ldr w9, [x5], #4 // filterPos[idx + 3] + mov x16, x4 // filter0 = filter + lsl w11, w11, #1 + add x12, x16, x7 // filter1 = filter0 + filterSize*2 + lsl w9, w9, #1 + add x13, x12, x7 // filter2 = filter1 + filterSize*2 + lsl w10, w10, #1 + add x4, x13, x7 // filter3 = filter2 + filterSize*2 + movi v0.2d, #0 // val sum part 1 (for dst[0]) + movi v1.2d, #0 // val sum part 2 (for dst[1]) + movi v2.2d, #0 // val sum part 3 (for dst[2]) + movi v3.2d, #0 // val sum part 4 (for dst[3]) + add x17, x3, w8, uxtw // srcp + filterPos[0] + add x8, x3, w10, uxtw // srcp + filterPos[1] + add x10, x3, w11, uxtw // srcp + filterPos[2] + add x11, x3, w9, uxtw // srcp + filterPos[3] + mov w15, w6 // filterSize counter +2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}] + ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1 + ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}] + ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize + uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign + sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size + uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits + mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 + sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits + uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits + mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 + sxtl v27.4s, v7.4h // exted filter lower half + uxtl2 v6.4s, v6.8h // extend srcp upper half + sxtl2 v7.4s, v7.8h // extend filter upper half + ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}] + mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] + ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize + uxtl v22.4s, v16.4h // extend srcp lower half + sxtl v23.4s, v17.4h // extend filter lower half + uxtl2 v16.4s, v16.8h // extend srcp upper half + sxtl2 v17.4s, v17.8h // extend filter upper half + mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] + mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] + ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}] + mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] + ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize + subs w15, w15, #8 // j -= 8: processed 8/filterSize + uxtl v28.4s, v18.4h // extend srcp lower half + sxtl v29.4s, v19.4h // extend filter lower half + uxtl2 v18.4s, v18.8h // extend srcp upper half + sxtl2 v19.4s, v19.8h // extend filter upper half + mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] + mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] + b.gt 2b // inner loop if filterSize not consumed completely + addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding + addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding + addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding + subs w2, w2, #4 // dstW -= 4 + sshl v0.4s, v0.4s, v21.4s // shift right (effectively rigth, as shift is negative); overflow expected + smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) + st1 {v0.4s}, [x1], #16 // write to destination part0123 + b.gt 1b // loop until end of line ret endfunc @@ -1333,117 +1333,117 @@ function ff_hscale16to19_X4_neon_asm, export=1 // x5 const int32_t *filterPos // w6 int filterSize - stp d8, d9, [sp, #-0x20]! - stp d10, d11, [sp, #0x10] + stp d8, d9, [sp, #-0x20]! + stp d10, d11, [sp, #0x10] - movi v18.4s, #1 - movi v17.4s, #1 - shl v18.4s, v18.4s, #19 - sub v21.4s, v18.4s, v17.4s // max allowed value - dup v17.4s, w0 // read shift - neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) + movi v18.4s, #1 + movi v17.4s, #1 + shl v18.4s, v18.4s, #19 + sub v21.4s, v18.4s, v17.4s // max allowed value + dup v17.4s, w0 // read shift + neg v20.4s, v17.4s // negate it, so it can be used in sshl (effectively shift right) - lsl w7, w6, #1 + lsl w7, w6, #1 1: - ldp w8, w9, [x5] - ldp w10, w11, [x5, #8] - - movi v16.2d, #0 // initialize accumulator for idx + 0 - movi v17.2d, #0 // initialize accumulator for idx + 1 - movi v18.2d, #0 // initialize accumulator for idx + 2 - movi v19.2d, #0 // initialize accumulator for idx + 3 - - mov x12, x4 // filter + 0 - add x13, x4, x7 // filter + 1 - add x8, x3, x8, lsl #1 // srcp + filterPos 0 - add x14, x13, x7 // filter + 2 - add x9, x3, x9, lsl #1 // srcp + filterPos 1 - add x15, x14, x7 // filter + 3 - add x10, x3, x10, lsl #1 // srcp + filterPos 2 - mov w0, w6 // save the filterSize to temporary variable - add x11, x3, x11, lsl #1 // srcp + filterPos 3 - add x5, x5, #16 // advance filter position - mov x16, xzr // clear the register x16 used for offsetting the filter values + ldp w8, w9, [x5] + ldp w10, w11, [x5, #8] + + movi v16.2d, #0 // initialize accumulator for idx + 0 + movi v17.2d, #0 // initialize accumulator for idx + 1 + movi v18.2d, #0 // initialize accumulator for idx + 2 + movi v19.2d, #0 // initialize accumulator for idx + 3 + + mov x12, x4 // filter + 0 + add x13, x4, x7 // filter + 1 + add x8, x3, x8, lsl #1 // srcp + filterPos 0 + add x14, x13, x7 // filter + 2 + add x9, x3, x9, lsl #1 // srcp + filterPos 1 + add x15, x14, x7 // filter + 3 + add x10, x3, x10, lsl #1 // srcp + filterPos 2 + mov w0, w6 // save the filterSize to temporary variable + add x11, x3, x11, lsl #1 // srcp + filterPos 3 + add x5, x5, #16 // advance filter position + mov x16, xzr // clear the register x16 used for offsetting the filter values 2: - ldr q4, [x8], #16 // load src values for idx 0 - ldr q5, [x9], #16 // load src values for idx 1 - uxtl v26.4s, v4.4h - uxtl2 v4.4s, v4.8h - ldr q31, [x12, x16] // load filter values for idx 0 - ldr q6, [x10], #16 // load src values for idx 2 - sxtl v22.4s, v31.4h - sxtl2 v31.4s, v31.8h - mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0 - uxtl v25.4s, v5.4h - uxtl2 v5.4s, v5.8h - ldr q30, [x13, x16] // load filter values for idx 1 - ldr q7, [x11], #16 // load src values for idx 3 - mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 - uxtl v24.4s, v6.4h - sxtl v8.4s, v30.4h - sxtl2 v30.4s, v30.8h - mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1 - ldr q29, [x14, x16] // load filter values for idx 2 - uxtl2 v6.4s, v6.8h - sxtl v9.4s, v29.4h - sxtl2 v29.4s, v29.8h - mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 - ldr q28, [x15, x16] // load filter values for idx 3 - mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2 - uxtl v23.4s, v7.4h - sxtl v10.4s, v28.4h - mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 - uxtl2 v7.4s, v7.8h - sxtl2 v28.4s, v28.8h - mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3 - sub w0, w0, #8 - cmp w0, #8 - mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 - - add x16, x16, #16 // advance filter values indexing - - b.ge 2b + ldr q4, [x8], #16 // load src values for idx 0 + ldr q5, [x9], #16 // load src values for idx 1 + uxtl v26.4s, v4.4h + uxtl2 v4.4s, v4.8h + ldr q31, [x12, x16] // load filter values for idx 0 + ldr q6, [x10], #16 // load src values for idx 2 + sxtl v22.4s, v31.4h + sxtl2 v31.4s, v31.8h + mla v16.4s, v26.4s, v22.4s // multiplication of lower half for idx 0 + uxtl v25.4s, v5.4h + uxtl2 v5.4s, v5.8h + ldr q30, [x13, x16] // load filter values for idx 1 + ldr q7, [x11], #16 // load src values for idx 3 + mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 + uxtl v24.4s, v6.4h + sxtl v8.4s, v30.4h + sxtl2 v30.4s, v30.8h + mla v17.4s, v25.4s, v8.4s // multiplication of lower half for idx 1 + ldr q29, [x14, x16] // load filter values for idx 2 + uxtl2 v6.4s, v6.8h + sxtl v9.4s, v29.4h + sxtl2 v29.4s, v29.8h + mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 + ldr q28, [x15, x16] // load filter values for idx 3 + mla v18.4s, v24.4s, v9.4s // multiplication of lower half for idx 2 + uxtl v23.4s, v7.4h + sxtl v10.4s, v28.4h + mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 + uxtl2 v7.4s, v7.8h + sxtl2 v28.4s, v28.8h + mla v19.4s, v23.4s, v10.4s // multiplication of lower half for idx 3 + sub w0, w0, #8 + cmp w0, #8 + mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 + + add x16, x16, #16 // advance filter values indexing + + b.ge 2b // 4 iterations left - sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements - - ldr d4, [x8] // load src values for idx 0 - ldr d31, [x12, x17] // load filter values for idx 0 - uxtl v4.4s, v4.4h - sxtl v31.4s, v31.4h - ldr d5, [x9] // load src values for idx 1 - mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 - ldr d30, [x13, x17] // load filter values for idx 1 - uxtl v5.4s, v5.4h - sxtl v30.4s, v30.4h - ldr d6, [x10] // load src values for idx 2 - mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 - ldr d29, [x14, x17] // load filter values for idx 2 - uxtl v6.4s, v6.4h - sxtl v29.4s, v29.4h - ldr d7, [x11] // load src values for idx 3 - ldr d28, [x15, x17] // load filter values for idx 3 - mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 - uxtl v7.4s, v7.4h - sxtl v28.4s, v28.4h - addp v16.4s, v16.4s, v17.4s - mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 - subs w2, w2, #4 - addp v18.4s, v18.4s, v19.4s - addp v16.4s, v16.4s, v18.4s - sshl v16.4s, v16.4s, v20.4s - smin v16.4s, v16.4s, v21.4s - - st1 {v16.4s}, [x1], #16 - add x4, x4, x7, lsl #2 - b.gt 1b - - ldp d8, d9, [sp] - ldp d10, d11, [sp, #0x10] - - add sp, sp, #0x20 + sub x17, x7, #8 // step back to wrap up the filter pos for last 4 elements + + ldr d4, [x8] // load src values for idx 0 + ldr d31, [x12, x17] // load filter values for idx 0 + uxtl v4.4s, v4.4h + sxtl v31.4s, v31.4h + ldr d5, [x9] // load src values for idx 1 + mla v16.4s, v4.4s, v31.4s // multiplication of upper half for idx 0 + ldr d30, [x13, x17] // load filter values for idx 1 + uxtl v5.4s, v5.4h + sxtl v30.4s, v30.4h + ldr d6, [x10] // load src values for idx 2 + mla v17.4s, v5.4s, v30.4s // multiplication of upper half for idx 1 + ldr d29, [x14, x17] // load filter values for idx 2 + uxtl v6.4s, v6.4h + sxtl v29.4s, v29.4h + ldr d7, [x11] // load src values for idx 3 + ldr d28, [x15, x17] // load filter values for idx 3 + mla v18.4s, v6.4s, v29.4s // multiplication of upper half for idx 2 + uxtl v7.4s, v7.4h + sxtl v28.4s, v28.4h + addp v16.4s, v16.4s, v17.4s + mla v19.4s, v7.4s, v28.4s // multiplication of upper half for idx 3 + subs w2, w2, #4 + addp v18.4s, v18.4s, v19.4s + addp v16.4s, v16.4s, v18.4s + sshl v16.4s, v16.4s, v20.4s + smin v16.4s, v16.4s, v21.4s + + st1 {v16.4s}, [x1], #16 + add x4, x4, x7, lsl #2 + b.gt 1b + + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + + add sp, sp, #0x20 ret endfunc diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S index 344d0659ea..934d62dfd0 100644 --- a/libswscale/aarch64/output.S +++ b/libswscale/aarch64/output.S @@ -29,178 +29,178 @@ function ff_yuv2planeX_8_neon, export=1 // x5 - const uint8_t *dither, // w6 - int offset - ld1 {v0.8b}, [x5] // load 8x8-bit dither - and w6, w6, #7 - cbz w6, 1f // check if offsetting present - ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only -1: uxtl v0.8h, v0.8b // extend dither to 16-bit - ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1) - ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2) - cmp w1, #8 // if filterSize == 8, branch to specialized version - b.eq 6f - cmp w1, #4 // if filterSize == 4, branch to specialized version - b.eq 8f - cmp w1, #2 // if filterSize == 2, branch to specialized version - b.eq 10f + ld1 {v0.8b}, [x5] // load 8x8-bit dither + and w6, w6, #7 + cbz w6, 1f // check if offsetting present + ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only +1: uxtl v0.8h, v0.8b // extend dither to 16-bit + ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1) + ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2) + cmp w1, #8 // if filterSize == 8, branch to specialized version + b.eq 6f + cmp w1, #4 // if filterSize == 4, branch to specialized version + b.eq 8f + cmp w1, #2 // if filterSize == 2, branch to specialized version + b.eq 10f // The filter size does not match of the of specialized implementations. It is either even or odd. If it is even // then use the first section below. - mov x7, #0 // i = 0 - tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version + mov x7, #0 // i = 0 + tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version // fs % 2 == 0 -2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value - mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value - mov w8, w1 // tmpfilterSize = filterSize - mov x9, x2 // srcp = src - mov x10, x0 // filterp = filter -3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1] - ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1] - add x11, x11, x7, lsl #1 // &src[j ][i] - add x12, x12, x7, lsl #1 // &src[j+1][i] - ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H - ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P - smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X - smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X - smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y - smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y - subs w8, w8, #2 // tmpfilterSize -= 2 - b.gt 3b // loop until filterSize consumed - - sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) - sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) - uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) - st1 {v3.8b}, [x3], #8 // write to destination - subs w4, w4, #8 // dstW -= 8 - add x7, x7, #8 // i += 8 - b.gt 2b // loop until width consumed +2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value + mov w8, w1 // tmpfilterSize = filterSize + mov x9, x2 // srcp = src + mov x10, x0 // filterp = filter +3: ldp x11, x12, [x9], #16 // get 2 pointers: src[j] and src[j+1] + ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1] + add x11, x11, x7, lsl #1 // &src[j ][i] + add x12, x12, x7, lsl #1 // &src[j+1][i] + ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H + ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P + smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X + smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X + smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y + smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y + subs w8, w8, #2 // tmpfilterSize -= 2 + b.gt 3b // loop until filterSize consumed + + sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) + sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) + uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) + st1 {v3.8b}, [x3], #8 // write to destination + subs w4, w4, #8 // dstW -= 8 + add x7, x7, #8 // i += 8 + b.gt 2b // loop until width consumed ret // If filter size is odd (most likely == 1), then use this section. // fs % 2 != 0 -4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value - mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value - mov w8, w1 // tmpfilterSize = filterSize - mov x9, x2 // srcp = src - mov x10, x0 // filterp = filter -5: ldr x11, [x9], #8 // get 1 pointer: src[j] - ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j] - add x11, x11, x7, lsl #1 // &src[j ][i] - ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H - smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X - smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X - subs w8, w8, #1 // tmpfilterSize -= 2 - b.gt 5b // loop until filterSize consumed - - sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) - sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) - uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) - st1 {v3.8b}, [x3], #8 // write to destination - subs w4, w4, #8 // dstW -= 8 - add x7, x7, #8 // i += 8 - b.gt 4b // loop until width consumed +4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value + mov w8, w1 // tmpfilterSize = filterSize + mov x9, x2 // srcp = src + mov x10, x0 // filterp = filter +5: ldr x11, [x9], #8 // get 1 pointer: src[j] + ldr h6, [x10], #2 // read 1 16 bit coeff X at filter[j] + add x11, x11, x7, lsl #1 // &src[j ][i] + ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H + smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X + smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X + subs w8, w8, #1 // tmpfilterSize -= 2 + b.gt 5b // loop until filterSize consumed + + sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) + sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) + uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) + st1 {v3.8b}, [x3], #8 // write to destination + subs w4, w4, #8 // dstW -= 8 + add x7, x7, #8 // i += 8 + b.gt 4b // loop until width consumed ret 6: // fs=8 - ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1] - ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3] - ldp x10, x11, [x2, #32] // load 2 pointers: src[j+4] and src[j+5] - ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7] + ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1] + ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3] + ldp x10, x11, [x2, #32] // load 2 pointers: src[j+4] and src[j+5] + ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7] // load 8x16-bit values for filter[j], where j=0..7 - ld1 {v6.8h}, [x0] + ld1 {v6.8h}, [x0] 7: - mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value - mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value - - ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] - ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] - ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] - ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] - ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}] - ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}] - ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}] - ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}] - - smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] - smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] - smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] - smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] - smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2] - smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2] - smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3] - smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3] - smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4] - smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4] - smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5] - smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5] - smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6] - smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6] - smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7] - smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7] - - sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) - sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) - uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) - subs w4, w4, #8 // dstW -= 8 - st1 {v3.8b}, [x3], #8 // write to destination - b.gt 7b // loop until width consumed + mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value + + ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] + ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] + ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] + ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] + ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}] + ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}] + ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}] + ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}] + + smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] + smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] + smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] + smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] + smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2] + smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2] + smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3] + smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3] + smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4] + smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4] + smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5] + smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5] + smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6] + smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6] + smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7] + smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7] + + sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) + sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) + uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) + subs w4, w4, #8 // dstW -= 8 + st1 {v3.8b}, [x3], #8 // write to destination + b.gt 7b // loop until width consumed ret 8: // fs=4 - ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1] - ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3] + ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1] + ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3] // load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes - ld1 {v6.4h}, [x0] + ld1 {v6.4h}, [x0] 9: - mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value - mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value - - ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] - ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] - ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] - ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] - - smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] - smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] - smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] - smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] - smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2] - smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2] - smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3] - smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3] - - sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) - sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) - uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) - st1 {v3.8b}, [x3], #8 // write to destination - subs w4, w4, #8 // dstW -= 8 - b.gt 9b // loop until width consumed + mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value + + ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] + ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] + ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] + ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] + + smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] + smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] + smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] + smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] + smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2] + smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2] + smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3] + smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3] + + sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) + sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) + uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) + st1 {v3.8b}, [x3], #8 // write to destination + subs w4, w4, #8 // dstW -= 8 + b.gt 9b // loop until width consumed ret 10: // fs=2 - ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1] + ldp x5, x6, [x2] // load 2 pointers: src[j ] and src[j+1] // load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes - ldr s6, [x0] + ldr s6, [x0] 11: - mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value - mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value - - ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] - ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] - - smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] - smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] - smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] - smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] - - sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) - sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) - uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) - st1 {v3.8b}, [x3], #8 // write to destination - subs w4, w4, #8 // dstW -= 8 - b.gt 11b // loop until width consumed + mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value + mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value + + ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] + ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] + + smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0] + smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0] + smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1] + smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1] + + sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) + sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) + uqshrn v3.8b, v3.8h, #3 // clip8(val>>19) + st1 {v3.8b}, [x3], #8 // write to destination + subs w4, w4, #8 // dstW -= 8 + b.gt 11b // loop until width consumed ret endfunc @@ -210,25 +210,25 @@ function ff_yuv2plane1_8_neon, export=1 // w2 - int dstW, // x3 - const uint8_t *dither, // w4 - int offset - ld1 {v0.8b}, [x3] // load 8x8-bit dither - and w4, w4, #7 - cbz w4, 1f // check if offsetting present - ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only -1: uxtl v0.8h, v0.8b // extend dither to 32-bit - uxtl v1.4s, v0.4h - uxtl2 v2.4s, v0.8h + ld1 {v0.8b}, [x3] // load 8x8-bit dither + and w4, w4, #7 + cbz w4, 1f // check if offsetting present + ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only +1: uxtl v0.8h, v0.8b // extend dither to 32-bit + uxtl v1.4s, v0.4h + uxtl2 v2.4s, v0.8h 2: - ld1 {v3.8h}, [x0], #16 // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H - sxtl v4.4s, v3.4h - sxtl2 v5.4s, v3.8h - add v4.4s, v4.4s, v1.4s - add v5.4s, v5.4s, v2.4s - sqshrun v4.4h, v4.4s, #6 - sqshrun2 v4.8h, v5.4s, #6 - - uqshrn v3.8b, v4.8h, #1 // clip8(val>>7) - subs w2, w2, #8 // dstW -= 8 - st1 {v3.8b}, [x1], #8 // write to destination - b.gt 2b // loop until width consumed + ld1 {v3.8h}, [x0], #16 // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H + sxtl v4.4s, v3.4h + sxtl2 v5.4s, v3.8h + add v4.4s, v4.4s, v1.4s + add v5.4s, v5.4s, v2.4s + sqshrun v4.4h, v4.4s, #6 + sqshrun2 v4.8h, v5.4s, #6 + + uqshrn v3.8b, v4.8h, #1 // clip8(val>>7) + subs w2, w2, #8 // dstW -= 8 + st1 {v3.8b}, [x1], #8 // write to destination + b.gt 2b // loop until width consumed ret endfunc diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index 379d75622e..89d69e7f6c 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -23,23 +23,23 @@ .macro load_yoff_ycoeff yoff ycoeff #if defined(__APPLE__) - ldp w9, w10, [sp, #\yoff] + ldp w9, w10, [sp, #\yoff] #else - ldr w9, [sp, #\yoff] - ldr w10, [sp, #\ycoeff] + ldr w9, [sp, #\yoff] + ldr w10, [sp, #\ycoeff] #endif .endm .macro load_args_nv12 - ldr x8, [sp] // table - load_yoff_ycoeff 8, 16 // y_offset, y_coeff - ld1 {v1.1d}, [x8] - dup v0.8h, w10 - dup v3.8h, w9 - sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) - sub w5, w5, w0 // w5 = linesizeY - width (paddingY) - sub w7, w7, w0 // w7 = linesizeC - width (paddingC) - neg w11, w0 + ldr x8, [sp] // table + load_yoff_ycoeff 8, 16 // y_offset, y_coeff + ld1 {v1.1d}, [x8] + dup v0.8h, w10 + dup v3.8h, w9 + sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) + sub w5, w5, w0 // w5 = linesizeY - width (paddingY) + sub w7, w7, w0 // w7 = linesizeC - width (paddingC) + neg w11, w0 .endm .macro load_args_nv21 @@ -47,52 +47,52 @@ .endm .macro load_args_yuv420p - ldr x13, [sp] // srcV - ldr w14, [sp, #8] // linesizeV - ldr x8, [sp, #16] // table - load_yoff_ycoeff 24, 32 // y_offset, y_coeff - ld1 {v1.1d}, [x8] - dup v0.8h, w10 - dup v3.8h, w9 - sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) - sub w5, w5, w0 // w5 = linesizeY - width (paddingY) - sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) - sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV) - lsr w11, w0, #1 - neg w11, w11 + ldr x13, [sp] // srcV + ldr w14, [sp, #8] // linesizeV + ldr x8, [sp, #16] // table + load_yoff_ycoeff 24, 32 // y_offset, y_coeff + ld1 {v1.1d}, [x8] + dup v0.8h, w10 + dup v3.8h, w9 + sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) + sub w5, w5, w0 // w5 = linesizeY - width (paddingY) + sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) + sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV) + lsr w11, w0, #1 + neg w11, w11 .endm .macro load_args_yuv422p - ldr x13, [sp] // srcV - ldr w14, [sp, #8] // linesizeV - ldr x8, [sp, #16] // table - load_yoff_ycoeff 24, 32 // y_offset, y_coeff - ld1 {v1.1d}, [x8] - dup v0.8h, w10 - dup v3.8h, w9 - sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) - sub w5, w5, w0 // w5 = linesizeY - width (paddingY) - sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) - sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV) + ldr x13, [sp] // srcV + ldr w14, [sp, #8] // linesizeV + ldr x8, [sp, #16] // table + load_yoff_ycoeff 24, 32 // y_offset, y_coeff + ld1 {v1.1d}, [x8] + dup v0.8h, w10 + dup v3.8h, w9 + sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) + sub w5, w5, w0 // w5 = linesizeY - width (paddingY) + sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) + sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV) .endm .macro load_chroma_nv12 - ld2 {v16.8b, v17.8b}, [x6], #16 - ushll v18.8h, v16.8b, #3 - ushll v19.8h, v17.8b, #3 + ld2 {v16.8b, v17.8b}, [x6], #16 + ushll v18.8h, v16.8b, #3 + ushll v19.8h, v17.8b, #3 .endm .macro load_chroma_nv21 - ld2 {v16.8b, v17.8b}, [x6], #16 - ushll v19.8h, v16.8b, #3 - ushll v18.8h, v17.8b, #3 + ld2 {v16.8b, v17.8b}, [x6], #16 + ushll v19.8h, v16.8b, #3 + ushll v18.8h, v17.8b, #3 .endm .macro load_chroma_yuv420p - ld1 {v16.8b}, [ x6], #8 - ld1 {v17.8b}, [x13], #8 - ushll v18.8h, v16.8b, #3 - ushll v19.8h, v17.8b, #3 + ld1 {v16.8b}, [ x6], #8 + ld1 {v17.8b}, [x13], #8 + ushll v18.8h, v16.8b, #3 + ushll v19.8h, v17.8b, #3 .endm .macro load_chroma_yuv422p @@ -100,9 +100,9 @@ .endm .macro increment_nv12 - ands w15, w1, #1 - csel w16, w7, w11, ne // incC = (h & 1) ? paddincC : -width - add x6, x6, w16, sxtw // srcC += incC + ands w15, w1, #1 + csel w16, w7, w11, ne // incC = (h & 1) ? paddincC : -width + add x6, x6, w16, sxtw // srcC += incC .endm .macro increment_nv21 @@ -110,100 +110,100 @@ .endm .macro increment_yuv420p - ands w15, w1, #1 - csel w16, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2 - csel w17, w14, w11, ne // incV = (h & 1) ? paddincV : -width/2 - add x6, x6, w16, sxtw // srcU += incU - add x13, x13, w17, sxtw // srcV += incV + ands w15, w1, #1 + csel w16, w7, w11, ne // incU = (h & 1) ? paddincU : -width/2 + csel w17, w14, w11, ne // incV = (h & 1) ? paddincV : -width/2 + add x6, x6, w16, sxtw // srcU += incU + add x13, x13, w17, sxtw // srcV += incV .endm .macro increment_yuv422p - add x6, x6, w7, sxtw // srcU += incU - add x13, x13, w14, sxtw // srcV += incV + add x6, x6, w7, sxtw // srcU += incU + add x13, x13, w14, sxtw // srcV += incV .endm .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2 - add v20.8h, v26.8h, v20.8h // Y1 + R1 - add v21.8h, v27.8h, v21.8h // Y2 + R2 - add v22.8h, v26.8h, v22.8h // Y1 + G1 - add v23.8h, v27.8h, v23.8h // Y2 + G2 - add v24.8h, v26.8h, v24.8h // Y1 + B1 - add v25.8h, v27.8h, v25.8h // Y2 + B2 - sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1) - sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R1) >> 1) - sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1) - sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G1) >> 1) - sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1) - sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B1) >> 1) - movi \a1, #255 - movi \a2, #255 + add v20.8h, v26.8h, v20.8h // Y1 + R1 + add v21.8h, v27.8h, v21.8h // Y2 + R2 + add v22.8h, v26.8h, v22.8h // Y1 + G1 + add v23.8h, v27.8h, v23.8h // Y2 + G2 + add v24.8h, v26.8h, v24.8h // Y1 + B1 + add v25.8h, v27.8h, v25.8h // Y2 + B2 + sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1) + sqrshrun \r2, v21.8h, #1 // clip_u8((Y2 + R1) >> 1) + sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1) + sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G1) >> 1) + sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1) + sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B1) >> 1) + movi \a1, #255 + movi \a2, #255 .endm .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 load_args_\ifmt - mov w9, w1 + mov w9, w1 1: - mov w8, w0 // w8 = width + mov w8, w0 // w8 = width 2: - movi v5.8h, #4, lsl #8 // 128 * (1<<3) + movi v5.8h, #4, lsl #8 // 128 * (1<<3) load_chroma_\ifmt - sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3) - sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3) - sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R) - sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g - sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g - add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G) - sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B) - zip2 v21.8h, v20.8h, v20.8h // R2 - zip1 v20.8h, v20.8h, v20.8h // R1 - zip2 v23.8h, v22.8h, v22.8h // G2 - zip1 v22.8h, v22.8h, v22.8h // G1 - zip2 v25.8h, v24.8h, v24.8h // B2 - zip1 v24.8h, v24.8h, v24.8h // B1 - ld1 {v2.16b}, [x4], #16 // load luma - ushll v26.8h, v2.8b, #3 // Y1*(1<<3) - ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3) - sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset - sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset - sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15 - sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15 + sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3) + sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3) + sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R) + sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g + sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g + add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G) + sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B) + zip2 v21.8h, v20.8h, v20.8h // R2 + zip1 v20.8h, v20.8h, v20.8h // R1 + zip2 v23.8h, v22.8h, v22.8h // G2 + zip1 v22.8h, v22.8h, v22.8h // G1 + zip2 v25.8h, v24.8h, v24.8h // B2 + zip1 v24.8h, v24.8h, v24.8h // B1 + ld1 {v2.16b}, [x4], #16 // load luma + ushll v26.8h, v2.8b, #3 // Y1*(1<<3) + ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3) + sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset + sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset + sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15 + sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15 .ifc \ofmt,argb // 1 2 3 0 - compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b + compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b .endif .ifc \ofmt,rgba // 0 1 2 3 - compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b + compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b .endif .ifc \ofmt,abgr // 3 2 1 0 - compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b + compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b .endif .ifc \ofmt,bgra // 2 1 0 3 - compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b + compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b .endif - st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32 - st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32 - subs w8, w8, #16 // width -= 16 - b.gt 2b - add x2, x2, w3, sxtw // dst += padding - add x4, x4, w5, sxtw // srcY += paddingY + st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32 + st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32 + subs w8, w8, #16 // width -= 16 + b.gt 2b + add x2, x2, w3, sxtw // dst += padding + add x4, x4, w5, sxtw // srcY += paddingY increment_\ifmt - subs w1, w1, #1 // height -= 1 - b.gt 1b - mov w0, w9 + subs w1, w1, #1 // height -= 1 + b.gt 1b + mov w0, w9 ret endfunc .endm .macro declare_rgb_funcs ifmt - declare_func \ifmt, argb - declare_func \ifmt, rgba - declare_func \ifmt, abgr - declare_func \ifmt, bgra + declare_func \ifmt, argb + declare_func \ifmt, rgba + declare_func \ifmt, abgr + declare_func \ifmt, bgra .endm declare_rgb_funcs nv12