arm: vp9itxfm16: Make the larger core transforms standalone functions

This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/arm/vp9itxfm_16bpp_neon.o from
17500 to 14516 bytes.

This gives a small slowdown of a couple of tens of cycles, up to around
150 cycles for the full case of the largest transform, but makes
it more feasible to add more optimized versions of these transforms.

Before:                                 Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub4_add_10_neon:    4237.4   3561.5   3971.8   2525.3
vp9_inv_dct_dct_16x16_sub16_add_10_neon:   6371.9   5452.0   5779.3   3910.5
vp9_inv_dct_dct_32x32_sub4_add_10_neon:   22068.8  17867.5  19555.2  13871.6
vp9_inv_dct_dct_32x32_sub32_add_10_neon:  37268.9  38684.2  32314.2  23969.0

After:
vp9_inv_dct_dct_16x16_sub4_add_10_neon:    4375.1   3571.9   4283.8   2567.2
vp9_inv_dct_dct_16x16_sub16_add_10_neon:   6415.6   5578.9   5844.6   3948.3
vp9_inv_dct_dct_32x32_sub4_add_10_neon:   22653.7  18079.7  19603.7  13905.3
vp9_inv_dct_dct_32x32_sub32_add_10_neon:  37593.2  38862.2  32235.8  24070.9

Signed-off-by: Martin Storsjö <martin@martin.st>
pull/244/merge
Martin Storsjö 8 years ago
parent b76533f105
commit 0ea603203d
  1. 43
      libavcodec/arm/vp9itxfm_16bpp_neon.S

@ -807,7 +807,7 @@ function idct16x16_dc_add_neon
endfunc
.ltorg
.macro idct16
function idct16
mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a
mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a
mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a
@ -853,9 +853,10 @@ endfunc
vmov d8, d21 @ d8 = t10a
butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11]
butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10]
.endm
bx lr
endfunc
.macro iadst16
function iadst16
movrel r12, iadst16_coeffs
vld1.16 {q0}, [r12,:128]!
vmovl.s16 q1, d1
@ -933,7 +934,8 @@ endfunc
vmov d16, d2
vmov d30, d4
.endm
bx lr
endfunc
.macro itxfm16_1d_funcs txfm
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
@ -941,6 +943,8 @@ endfunc
@ r0 = dst (temp buffer)
@ r2 = src
function \txfm\()16_1d_2x16_pass1_neon
push {lr}
mov r12, #64
vmov.s32 q4, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@ -948,7 +952,7 @@ function \txfm\()16_1d_2x16_pass1_neon
vst1.32 {d8}, [r2,:64], r12
.endr
\txfm\()16
bl \txfm\()16
@ Do eight 2x2 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
@ -959,7 +963,7 @@ function \txfm\()16_1d_2x16_pass1_neon
.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
vst1.32 {d\i}, [r0,:64]!
.endr
bx lr
pop {pc}
endfunc
@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
@ -968,6 +972,8 @@ endfunc
@ r1 = dst stride
@ r2 = src (temp buffer)
function \txfm\()16_1d_2x16_pass2_neon
push {lr}
mov r12, #64
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64], r12
@ -975,7 +981,7 @@ function \txfm\()16_1d_2x16_pass2_neon
add r3, r0, r1
lsl r1, r1, #1
\txfm\()16
bl \txfm\()16
.macro load_add_store coef0, coef1, coef2, coef3
vrshr.s32 \coef0, \coef0, #6
@ -1019,7 +1025,7 @@ function \txfm\()16_1d_2x16_pass2_neon
load_add_store q12, q13, q14, q15
.purgem load_add_store
bx lr
pop {pc}
endfunc
.endm
@ -1193,7 +1199,7 @@ function idct32x32_dc_add_neon
pop {r4-r9,pc}
endfunc
.macro idct32_odd
function idct32_odd
movrel r12, idct_coeffs
@ Overwrite the idct16 coeffs with the stored ones for idct32
@ -1262,7 +1268,8 @@ endfunc
mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22
mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
.endm
bx lr
endfunc
@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
@ We don't have register space to do a single pass IDCT of 2x32 though,
@ -1274,6 +1281,8 @@ endfunc
@ r1 = unused
@ r2 = src
function idct32_1d_2x32_pass1_neon
push {lr}
@ Double stride of the input, since we only read every other line
mov r12, #256
vmov.s32 d8, #0
@ -1284,7 +1293,7 @@ function idct32_1d_2x32_pass1_neon
vst1.32 {d8}, [r2,:64], r12
.endr
idct16
bl idct16
@ Do eight 2x2 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
@ -1319,7 +1328,7 @@ function idct32_1d_2x32_pass1_neon
vst1.16 {d8}, [r2,:64], r12
.endr
idct32_odd
bl idct32_odd
transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
@ -1343,7 +1352,7 @@ function idct32_1d_2x32_pass1_neon
store_rev 31, 29, 27, 25, 23, 21, 19, 17
store_rev 30, 28, 26, 24, 22, 20, 18, 16
.purgem store_rev
bx lr
pop {pc}
endfunc
.ltorg
@ -1354,6 +1363,8 @@ endfunc
@ r1 = dst stride
@ r2 = src (temp buffer)
function idct32_1d_2x32_pass2_neon
push {lr}
mov r12, #256
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@ -1361,7 +1372,7 @@ function idct32_1d_2x32_pass2_neon
.endr
sub r2, r2, r12, lsl #4
idct16
bl idct16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vst1.32 {d\i}, [r2,:64], r12
@ -1377,7 +1388,7 @@ function idct32_1d_2x32_pass2_neon
sub r2, r2, r12, lsl #4
sub r2, r2, #128
idct32_odd
bl idct32_odd
@ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
@ allow clobbering q2-q3 below.
@ -1439,7 +1450,7 @@ function idct32_1d_2x32_pass2_neon
vmovl.s16 q3, d3
vmovl.s16 q1, d1
vmovl.s16 q0, d0
bx lr
pop {pc}
endfunc
const min_eob_idct_idct_32, align=4

Loading…
Cancel
Save