checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C
version can still outperform the NEON version in specific cases. The balance
between different code paths is stream-dependent, but in practice the best
case happens about 5% of the time, the worst case happens about 40% of the
time, and the complexity of the remaining cases falls somewhere in between.
Therefore, taking the average of the best and worst case timings is
probably a conservative estimate of the degree by which the NEON code
improves performance.
vc1dsp.vc1_h_loop_filter4_bestcase_c: 19.0
vc1dsp.vc1_h_loop_filter4_bestcase_neon: 48.5
vc1dsp.vc1_h_loop_filter4_worstcase_c: 144.7
vc1dsp.vc1_h_loop_filter4_worstcase_neon: 76.2
vc1dsp.vc1_h_loop_filter8_bestcase_c: 41.0
vc1dsp.vc1_h_loop_filter8_bestcase_neon: 75.0
vc1dsp.vc1_h_loop_filter8_worstcase_c: 294.0
vc1dsp.vc1_h_loop_filter8_worstcase_neon: 102.7
vc1dsp.vc1_h_loop_filter16_bestcase_c: 54.7
vc1dsp.vc1_h_loop_filter16_bestcase_neon: 130.0
vc1dsp.vc1_h_loop_filter16_worstcase_c: 569.7
vc1dsp.vc1_h_loop_filter16_worstcase_neon: 186.7
vc1dsp.vc1_v_loop_filter4_bestcase_c: 20.2
vc1dsp.vc1_v_loop_filter4_bestcase_neon: 47.2
vc1dsp.vc1_v_loop_filter4_worstcase_c: 164.2
vc1dsp.vc1_v_loop_filter4_worstcase_neon: 68.5
vc1dsp.vc1_v_loop_filter8_bestcase_c: 43.5
vc1dsp.vc1_v_loop_filter8_bestcase_neon: 55.2
vc1dsp.vc1_v_loop_filter8_worstcase_c: 316.2
vc1dsp.vc1_v_loop_filter8_worstcase_neon: 72.7
vc1dsp.vc1_v_loop_filter16_bestcase_c: 62.2
vc1dsp.vc1_v_loop_filter16_bestcase_neon: 103.7
vc1dsp.vc1_v_loop_filter16_worstcase_c: 646.5
vc1dsp.vc1_v_loop_filter16_worstcase_neon: 110.7
Signed-off-by: Ben Avison <bavison@riscosopen.org>
Signed-off-by: Martin Storsjö <martin@martin.st>
@ -1161,3 +1161,646 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1
vst1.32{d1[1]}, [r0,:32]
bx lr
endfunc
@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
@
@ Rows are named P1..P8 from top to bottom; the block edge lies between P4
@ (last row of the upper block) and P5 (first row of the lower block). Only
@ P4 and P5 are ever modified. The decision taken for pixel pair 2 (the third
@ column) gates the whole group: if that pair is not filtered, nothing is
@ written.
@
@ On entry:
@   r0 -> top-left pel of lower block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
@ Uses (none of them callee-saved): r0, r3, flags, d0-d7, d16-d25
@ Relies on .Lcoeffs (defined elsewhere in this file) supplying the 16-bit
@ multipliers used below: d0[0] = 2 and d0[1] = 5.
function ff_vc1_v_loop_filter4_neon, export=1
        sub             r3, r0, r1, lsl #2      @ r3 -> P1 row, 4 rows above the edge
        vldr            d0, .Lcoeffs
        vld1.32         {d1[0]}, [r0], r1       @ P5
        vld1.32         {d2[0]}, [r3], r1       @ P1
        vld1.32         {d3[0]}, [r3], r1       @ P2
        vld1.32         {d4[0]}, [r0], r1       @ P6
        vld1.32         {d5[0]}, [r3], r1       @ P3
        vld1.32         {d6[0]}, [r0], r1       @ P7
        vld1.32         {d7[0]}, [r3]           @ P4 (no post-increment: r3 stays on the P4 row for the stores)
        vld1.32         {d16[0]}, [r0]          @ P8
        vshll.u8        q9, d1, #1              @ 2*P5
        vdup.16         d17, r2                 @ pq
        vshll.u8        q10, d2, #1             @ 2*P1
        vmovl.u8        q11, d3                 @ P2
        vmovl.u8        q1, d4                  @ P6
        vmovl.u8        q12, d5                 @ P3
        vmls.i16        d20, d22, d0[1]         @ 2*P1-5*P2
        vmovl.u8        q11, d6                 @ P7
        vmls.i16        d18, d2, d0[1]          @ 2*P5-5*P6
        vshll.u8        q2, d5, #1              @ 2*P3
        vmovl.u8        q3, d7                  @ P4
        vmla.i16        d18, d22, d0[1]         @ 2*P5-5*P6+5*P7
        vmovl.u8        q11, d16                @ P8
        vmla.u16        d20, d24, d0[1]         @ 2*P1-5*P2+5*P3
        vmovl.u8        q12, d1                 @ P5
        vmls.u16        d4, d6, d0[1]           @ 2*P3-5*P4
        vmls.u16        d18, d22, d0[0]         @ 2*P5-5*P6+5*P7-2*P8
        vsub.i16        d1, d6, d24             @ P4-P5
        vmls.i16        d20, d6, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
        vmla.i16        d4, d24, d0[1]          @ 2*P3-5*P4+5*P5
        vmls.i16        d4, d2, d0[0]           @ 2*P3-5*P4+5*P5-2*P6
        vabs.s16        d2, d1
        vrshr.s16       d3, d18, #3
        vrshr.s16       d5, d20, #3
        vshr.s16        d2, d2, #1              @ clip
        vrshr.s16       d4, d4, #3
        vabs.s16        d3, d3                  @ a2
        vshr.s16        d1, d1, #8              @ clip_sign
        vabs.s16        d5, d5                  @ a1
        vceq.i16        d7, d2, #0              @ test clip == 0
        vabs.s16        d16, d4                 @ a0
        vshr.s16        d4, d4, #8              @ a0_sign
        vcge.s16        d18, d5, d3             @ test a1 >= a2
        vcge.s16        d17, d16, d17           @ test a0 >= pq
        vbsl            d18, d3, d5             @ a3
        vsub.i16        d1, d1, d4              @ clip_sign - a0_sign
        vorr            d3, d7, d17             @ test clip == 0 || a0 >= pq
        vqsub.u16       d4, d16, d18            @ a0 >= a3 ? a0-a3 : 0 (saturating, so the a3 > a0 case yields d = 0 without a separate abs/select)
        vcge.s16        d7, d18, d16            @ test a3 >= a0
        vmul.i16        d0, d4, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr            d4, d3, d7              @ test clip == 0 || a0 >= pq || a3 >= a0
        vmov.32         r0, d4[1]               @ lanes 2-3 of the combined test -> gp reg (r0 no longer needed as a pointer)
        vshr.u16        d0, d0, #3              @ d = a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vcge.s16        d4, d0, d2              @ test d >= clip
        tst             r0, #1                  @ bit 0 = test result for pixel pair 2
        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
        vbsl            d4, d2, d0              @ FFMIN(d, clip)
        vbic            d0, d4, d3              @ zero d where clip == 0 || a0 >= pq (a3 > a0 already zeroed by the saturating subtract)
        vmls.i16        d6, d0, d1              @ P4 -= d * (clip_sign - a0_sign): negates d, or zeroes it when the signs match
        vmla.i16        d24, d0, d1             @ P5 += d * (clip_sign - a0_sign)
        vqmovun.s16     d0, q3                  @ saturate P4 back to u8
        vqmovun.s16     d1, q12                 @ saturate P5 back to u8
        vst1.32         {d0[0]}, [r3], r1       @ write P4 row, advance to P5 row
        vst1.32         {d1[0]}, [r3]           @ write P5 row
1:      bx              lr
endfunc
@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
@
@ Rows are named P1..P8 from top to bottom; the block edge lies between P4
@ (last row of the upper block) and P5 (first row of the lower block). Only
@ P4 and P5 are ever modified. The 8 columns are handled as two groups of 4;
@ within each group the decision for pixel pair 2 (lanes 2 and 6 overall)
@ gates the whole group.
@
@ On entry:
@   r0 -> top-left pel of lower block
@   r1 = row stride, bytes
@   r2 = PQUANT bitstream parameter
@ Every row access carries a :64 alignment qualifier, so each row address
@ must be 8-byte aligned.
@ Uses (none of them callee-saved, so nothing is pushed or popped):
@   r0, r2, r3, flags, q0-q3, q8-q15
@ Relies on .Lcoeffs (defined elsewhere in this file) supplying the 16-bit
@ multipliers used below: d0[0] = 2 and d0[1] = 5.
function ff_vc1_v_loop_filter8_neon, export=1
        sub             r3, r0, r1, lsl #2      @ r3 -> P1 row, 4 rows above the edge
        vldr            d0, .Lcoeffs
        vld1.32         {d1}, [r0:64], r1       @ P5
        vld1.32         {d2}, [r3:64], r1       @ P1
        vld1.32         {d3}, [r3:64], r1       @ P2
        vld1.32         {d4}, [r0:64], r1       @ P6
        vld1.32         {d5}, [r3:64], r1       @ P3
        vld1.32         {d6}, [r0:64], r1       @ P7
        vshll.u8        q8, d1, #1              @ 2*P5
        vshll.u8        q9, d2, #1              @ 2*P1
        vld1.32         {d7}, [r3:64]           @ P4 (no post-increment: r3 stays on the P4 row for the stores)
        vmovl.u8        q1, d3                  @ P2
        vld1.32         {d20}, [r0:64]          @ P8
        vmovl.u8        q11, d4                 @ P6
        vdup.16         q12, r2                 @ pq
        vmovl.u8        q13, d5                 @ P3
        vmls.i16        q9, q1, d0[1]           @ 2*P1-5*P2
        vmovl.u8        q1, d6                  @ P7
        vshll.u8        q2, d5, #1              @ 2*P3
        vmls.i16        q8, q11, d0[1]          @ 2*P5-5*P6
        vmovl.u8        q3, d7                  @ P4
        vmovl.u8        q10, d20                @ P8
        vmla.i16        q8, q1, d0[1]           @ 2*P5-5*P6+5*P7
        vmovl.u8        q1, d1                  @ P5
        vmla.i16        q9, q13, d0[1]          @ 2*P1-5*P2+5*P3
        vsub.i16        q13, q3, q1             @ P4-P5
        vmls.i16        q2, q3, d0[1]           @ 2*P3-5*P4
        vmls.i16        q8, q10, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
        vabs.s16        q10, q13
        vshr.s16        q13, q13, #8            @ clip_sign
        vmls.i16        q9, q3, d0[0]           @ 2*P1-5*P2+5*P3-2*P4
        vshr.s16        q10, q10, #1            @ clip
        vmla.i16        q2, q1, d0[1]           @ 2*P3-5*P4+5*P5
        vrshr.s16       q8, q8, #3
        vmls.i16        q2, q11, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
        vceq.i16        q11, q10, #0            @ test clip == 0
        vrshr.s16       q9, q9, #3
        vabs.s16        q8, q8                  @ a2
        vabs.s16        q9, q9                  @ a1
        vrshr.s16       q2, q2, #3
        vcge.s16        q14, q9, q8             @ test a1 >= a2
        vabs.s16        q15, q2                 @ a0
        vshr.s16        q2, q2, #8              @ a0_sign
        vbsl            q14, q8, q9             @ a3
        vcge.s16        q8, q15, q12            @ test a0 >= pq
        vsub.i16        q2, q13, q2             @ clip_sign - a0_sign
        vqsub.u16       q9, q15, q14            @ a0 >= a3 ? a0-a3 : 0 (saturating, so the a3 > a0 case yields d = 0 without a separate abs/select)
        vcge.s16        q12, q14, q15           @ test a3 >= a0 (pq in q12 is no longer needed)
        vorr            q8, q11, q8             @ test clip == 0 || a0 >= pq
        vmul.i16        q0, q9, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
        vorr            q9, q8, q12             @ test clip == 0 || a0 >= pq || a3 >= a0
        vshl.i64        q11, q9, #16            @ move pair 2 of each group into the top 16 bits of its 64-bit half
        vmov.32         r0, d18[1]              @ lanes 2-3 of the combined test -> gp reg (r0 no longer needed as a pointer)
        vshr.u16        q0, q0, #3              @ d = a0 >= a3 ? (5*(a0-a3))>>3 : 0
        vmov.32         r2, d19[1]              @ lanes 6-7 of the combined test -> gp reg
        vshr.s64        q9, q11, #48            @ sign-extend: pair 2's test result now fills all 4 lanes of its group
        vcge.s16        q11, q0, q10            @ test d >= clip
        vorr            q8, q8, q9              @ per-lane "do not filter" || group's pair 2 "do not filter"
        and             r0, r0, r2              @ bit 0 set only if both groups' pair 2 say "do not filter"
        vbsl            q11, q10, q0            @ FFMIN(d, clip)
        tst             r0, #1
        bne             1f                      @ none of the 8 pixel pairs should be updated in this case
        vbic            q0, q11, q8             @ zero d wherever filtering is disabled (a3 > a0 already zeroed by the saturating subtract)
        vmls.i16        q3, q0, q2              @ P4 -= d * (clip_sign - a0_sign): negates d, or zeroes it when the signs match
        vmla.i16        q1, q0, q2              @ P5 += d * (clip_sign - a0_sign)
        vqmovun.s16     d0, q3                  @ saturate P4 back to u8
        vqmovun.s16     d1, q1                  @ saturate P5 back to u8
        vst1.32         {d0}, [r3:64], r1       @ write P4 row, advance to P5 row
        vst1.32         {d1}, [r3:64]           @ write P5 row
1:      bx              lr
endfunc
@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
vqsub.u16q15, q10, q3@ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
vqsub.u16q14, q2, q13@ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
vbic q0, q14, q10@ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
vmov.32r6, d7[1]
and r12, r2, r3
vbic q2, q2, q9@ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
vmls.i16q6, q0, q7@ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
vmls.i16q5, q2, q1@ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
and r14, r5, r6
vmla.i16q4, q2, q1@ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
and r12, r12, r14
vqmovun.s16d4, q6
vmla.i16q8, q0, q7@ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
tst r12, #1
bne 4f@ none of the 16 pixel pairs should be updated in this case