ARM: allow building in Thumb2 mode

Signed-off-by: Mans Rullgard <mans@mansr.com>
pull/2/head
Mans Rullgard 14 years ago
parent 9cd7b8549b
commit 8986fddc2b
  1. 3
      configure
  2. 3
      libavcodec/arm/aac.h
  3. 1
      libavcodec/arm/ac3dsp_arm.S
  4. 2
      libavcodec/arm/ac3dsp_armv6.S
  5. 1
      libavcodec/arm/ac3dsp_neon.S
  6. 93
      libavcodec/arm/asm.S
  7. 1
      libavcodec/arm/dcadsp_neon.S
  8. 10
      libavcodec/arm/dsputil_arm.S
  9. 76
      libavcodec/arm/dsputil_armv6.S
  10. 3
      libavcodec/arm/dsputil_neon.S
  11. 21
      libavcodec/arm/dsputil_vfp.S
  12. 4
      libavcodec/arm/fmtconvert_neon.S
  13. 3
      libavcodec/arm/fmtconvert_vfp.S
  14. 98
      libavcodec/arm/h264dsp_neon.S
  15. 23
      libavcodec/arm/h264idct_neon.S
  16. 3
      libavcodec/arm/mathops.h
  17. 4
      libavcodec/arm/mdct_neon.S
  18. 6
      libavcodec/arm/mpegaudiodsp_fixed_armv6.S
  19. 12
      libavcodec/arm/mpegvideo_armv5te_s.S
  20. 4
      libavcodec/arm/mpegvideo_neon.S
  21. 1
      libavcodec/arm/rdft_neon.S
  22. 32
      libavcodec/arm/simple_idct_arm.S
  23. 39
      libavcodec/arm/simple_idct_armv5te.S
  24. 33
      libavcodec/arm/simple_idct_armv6.S
  25. 6
      libavcodec/arm/simple_idct_neon.S
  26. 2
      libavcodec/arm/synth_filter_neon.S
  27. 27
      libavcodec/arm/vp56_arith.h
  28. 36
      libavcodec/arm/vp8_armv6.S
  29. 16
      libavcodec/arm/vp8dsp_neon.S
  30. 2
      libavutil/arm/intmath.h

3
configure vendored

@ -967,6 +967,7 @@ CONFIG_LIST="
static
swscale
swscale_alpha
thumb
vaapi
vdpau
version3
@ -2607,7 +2608,7 @@ if enabled alpha; then
elif enabled arm; then
check_cflags -marm
enabled thumb && check_cflags -mthumb || check_cflags -marm
nogas=die
if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then

@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
"vmov d1, %2, %3 \n\t"
"lsls %6, %6, #1 \n\t"
"and %0, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %1, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %2, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"vmov d4, %0, %1 \n\t"
"and %3, %5, #1<<31 \n\t"

@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1
lsl r3, lr, #1
ldrh r12, [r0, r3]
subs r2, r2, #1
it gt
ldrbgt lr, [r1], #1
add r12, r12, #1
strh r12, [r0, r3]

@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
mov r11, r10
ldrb r10, [r4], #1 @ band_start_tab[band++]
subs r9, r9, r5 @ - floor
it lt
movlt r9, #0
cmp r10, r3 @ - end
and r9, r9, r8 @ & 0x1fe0
ite gt
subgt r8, r3, r11
suble r8, r10, r11
add r9, r9, r5 @ + floor => m

@ -41,6 +41,7 @@ endfunc
function ff_ac3_exponent_min_neon, export=1
cmp r1, #0
it eq
bxeq lr
push {lr}
mov r12, #256

@ -24,9 +24,18 @@
# define ELF
#else
# define ELF @
#endif
@ Line prefixes for instruction-set specific code.  When CONFIG_THUMB is
@ non-zero the ARM-only prefix expands to the comment character and the
@ Thumb-only prefix expands to nothing; otherwise the roles are swapped.
@ Any line tagged with the inactive prefix is therefore commented out.
#if CONFIG_THUMB
# define A @
# define T
#else
# define A
# define T @
#endif
@ Unified syntax is selected unconditionally; the .thumb directive below
@ only takes effect when the Thumb-only prefix is active.
.syntax unified
T .thumb
.macro require8 val=1
ELF .eabi_attribute 24, \val
@ -82,6 +91,90 @@ ELF .size \name, . - \name
#endif
.endm
@ ldr_pre: word load with a pre-indexed register offset and base writeback.
@ Effect: rn = rn + rm, then rt = [rn].
@ rm is declared vararg, so it receives the rest of the operand list
@ (for example a register followed by a shift).
@ ARM mode uses the single writeback addressing form; Thumb mode emits an
@ explicit non-flag-setting add followed by a plain load.
.macro ldr_pre rt, rn, rm:vararg
A ldr \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T ldr \rt, [\rn]
.endm
@ ldr_post: word load with a post-indexed register offset.
@ Effect: rt = [rn], then rn = rn + rm.
@ rm is declared vararg, so it receives the rest of the operand list.
@ NOTE(review): in the Thumb sequence the load finishes before the add
@ reads rm, so rt must not alias rm (or rn) for both variants to agree.
.macro ldr_post rt, rn, rm:vararg
A ldr \rt, [\rn], \rm
T ldr \rt, [\rn]
T add \rn, \rn, \rm
.endm
@ ldrd_reg: doubleword load from a register-offset address, no writeback.
@ Effect: rt, rt2 = [rn + rm]; rn and rm are left unchanged.
@ The Thumb sequence first forms the address in rt (used as scratch) and
@ then loads the pair through it, so rt is overwritten twice.
.macro ldrd_reg rt, rt2, rn, rm
A ldrd \rt, \rt2, [\rn, \rm]
T add \rt, \rn, \rm
T ldrd \rt, \rt2, [\rt]
.endm
@ ldrd_post: doubleword load with a post-indexed register offset.
@ Effect: rt, rt2 = [rn], then rn = rn + rm.
@ NOTE(review): in the Thumb sequence the add runs after the load, so
@ rt and rt2 must not alias rn or rm for both variants to agree.
.macro ldrd_post rt, rt2, rn, rm
A ldrd \rt, \rt2, [\rn], \rm
T ldrd \rt, \rt2, [\rn]
T add \rn, \rn, \rm
.endm
@ ldrh_pre: halfword load with a pre-indexed register offset and writeback.
@ Effect: rn = rn + rm, then rt = zero-extended halfword at [rn].
@ Thumb mode splits the writeback form into an add plus a plain load.
.macro ldrh_pre rt, rn, rm
A ldrh \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T ldrh \rt, [\rn]
.endm
@ ldrh_dpre: halfword load with a decrementing pre-indexed register offset.
@ Effect: rn = rn - rm, then rt = zero-extended halfword at [rn].
@ Thumb mode splits the writeback form into a sub plus a plain load.
.macro ldrh_dpre rt, rn, rm
A ldrh \rt, [\rn, -\rm]!
T sub \rn, \rn, \rm
T ldrh \rt, [\rn]
.endm
@ ldrh_post: halfword load with a post-indexed register offset.
@ Effect: rt = zero-extended halfword at [rn], then rn = rn + rm.
@ NOTE(review): in the Thumb sequence the add runs after the load, so
@ rt must not alias rn or rm for both variants to agree.
.macro ldrh_post rt, rn, rm
A ldrh \rt, [\rn], \rm
T ldrh \rt, [\rn]
T add \rn, \rn, \rm
.endm
@ str_post: word store with a post-indexed register offset.
@ Effect: [rn] = rt, then rn = rn + rm.
@ rm is declared vararg, so it receives the rest of the operand list.
@ Thumb mode emits a plain store followed by a non-flag-setting add.
.macro str_post rt, rn, rm:vararg
A str \rt, [\rn], \rm
T str \rt, [\rn]
T add \rn, \rn, \rm
.endm
@ strb_post: byte store with a post-indexed register offset.
@ Effect: byte at [rn] = low byte of rt, then rn = rn + rm.
@ rm is declared vararg, so it receives the rest of the operand list.
@ Thumb mode emits a plain store followed by a non-flag-setting add.
.macro strb_post rt, rn, rm:vararg
A strb \rt, [\rn], \rm
T strb \rt, [\rn]
T add \rn, \rn, \rm
.endm
@ strd_post: doubleword store with a post-indexed register offset.
@ Effect: [rn] = rt, rt2, then rn = rn + rm.
@ Thumb mode emits a plain doubleword store followed by an add.
.macro strd_post rt, rt2, rn, rm
A strd \rt, \rt2, [\rn], \rm
T strd \rt, \rt2, [\rn]
T add \rn, \rn, \rm
.endm
@ strh_pre: halfword store with a pre-indexed register offset and writeback.
@ Effect: rn = rn + rm, then halfword at [rn] = low half of rt.
@ Thumb mode splits the writeback form into an add plus a plain store.
.macro strh_pre rt, rn, rm
A strh \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T strh \rt, [\rn]
.endm
@ strh_dpre: halfword store with a decrementing pre-indexed register offset.
@ Effect: rn = rn - rm, then halfword at [rn] = low half of rt.
@ Thumb mode splits the writeback form into a sub plus a plain store.
.macro strh_dpre rt, rn, rm
A strh \rt, [\rn, -\rm]!
T sub \rn, \rn, \rm
T strh \rt, [\rn]
.endm
@ strh_post: halfword store with a post-indexed register offset.
@ Effect: halfword at [rn] = low half of rt, then rn = rn + rm.
@ Thumb mode emits a plain store followed by a non-flag-setting add.
.macro strh_post rt, rn, rm
A strh \rt, [\rn], \rm
T strh \rt, [\rn]
T add \rn, \rn, \rm
.endm
@ strh_dpost: halfword store with a decrementing post-indexed offset.
@ Effect: halfword at [rn] = low half of rt, then rn = rn - rm.
@ Thumb mode emits a plain store followed by a non-flag-setting sub.
.macro strh_dpost rt, rn, rm
A strh \rt, [\rn], -\rm
T strh \rt, [\rn]
T sub \rn, \rn, \rm
.endm
#if HAVE_VFP_ARGS
.eabi_attribute 28, 1
# define VFP

@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1
add r5, r2, #256*4-16 @ cf1
sub r1, r1, #12
cmp r3, #32
ite eq
moveq r6, #256/32
movne r6, #256/64
NOVFP vldr s0, [sp, #16] @ scale

@ -554,10 +554,12 @@ endfunc
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
it eq
andeq r14, r14, r14, \rnd #1
add r8, r8, r10
add r9, r9, r11
ldr r12, =0xfcfcfcfc >> 2
itt eq
addeq r8, r8, r14
addeq r9, r9, r14
and r4, r12, r4, lsr #2
@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #4] /* moved from [A] */
@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
ldr r4, [r1, #4] /* moved from [B] */
@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #12] /* moved from [D] */
@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
add r0, r0, #16 /* moved from [E] */

@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1
ldr r5, [r1, #4]
ldr r6, [r1, #8]
ldr r7, [r1, #12]
ldr r4, [r1], r2
ldr_post r4, r1, r2
strd r6, r7, [r0, #8]
ldr r9, [r1, #4]
strd r4, r5, [r0], r2
strd_post r4, r5, r0, r2
ldr r10, [r1, #8]
ldr r11, [r1, #12]
ldr r8, [r1], r2
ldr_post r8, r1, r2
strd r10, r11, [r0, #8]
subs r3, r3, #2
strd r8, r9, [r0], r2
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11}
@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1
push {r4-r7}
1:
ldr r5, [r1, #4]
ldr r4, [r1], r2
ldr_post r4, r1, r2
ldr r7, [r1, #4]
strd r4, r5, [r0], r2
ldr r6, [r1], r2
strd_post r4, r5, r0, r2
ldr_post r6, r1, r2
subs r3, r3, #2
strd r6, r7, [r0], r2
strd_post r6, r7, r0, r2
bne 1b
pop {r4-r7}
@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1
ldr r5, [r1, #4]
ldr r7, [r1, #5]
lsr r6, r4, #8
ldr r8, [r1, r2]!
ldr_pre r8, r1, r2
orr r6, r6, r5, lsl #24
ldr r9, [r1, #4]
ldr r11, [r1, #5]
@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1
uhadd8 r9, r9, r11
and r6, r6, r12
uadd8 r8, r8, r14
strd r4, r5, [r0], r2
strd_post r4, r5, r0, r2
uadd8 r9, r9, r6
strd r8, r9, [r0], r2
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11, pc}
@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1
orr r12, r12, r12, lsl #16
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1
uhadd8 r9, r5, r7
eor r11, r5, r7
and r10, r10, r12
ldr r4, [r1, r2]!
ldr_pre r4, r1, r2
uadd8 r8, r8, r10
and r11, r11, r12
uadd8 r9, r9, r11
@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1
eor r7, r5, r7
uadd8 r10, r10, r6
and r7, r7, r12
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
uadd8 r11, r11, r7
strd r8, r9, [r0], r2
strd_post r8, r9, r0, r2
ldr r7, [r1, #4]
strd r10, r11, [r0], r2
strd_post r10, r11, r0, r2
bne 1b
pop {r4-r11}
@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r7, [r1, #5]
ldr r8, [r1, r2]!
ldr_pre r8, r1, r2
ldr r9, [r1, #4]
ldr r14, [r1, #5]
add r1, r1, r2
@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1
push {r4-r9, lr}
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
ldr r4, [r1, r2]!
ldr_pre r4, r1, r2
uhadd8 r9, r5, r7
ldr r5, [r1, #4]
uhadd8 r12, r4, r6
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
uhadd8 r14, r5, r7
ldr r7, [r1, #4]
stm r0, {r8,r9}
@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1
orr lr, lr, lr, lsl #16
ldrd r4, r5, [r0]
ldr r10, [r1, #4]
ldr r9, [r1], r2
ldr_post r9, r1, r2
subs r3, r3, #2
1:
pld [r1, r2]
eor r8, r4, r9
uhadd8 r4, r4, r9
eor r12, r5, r10
ldrd r6, r7, [r0, r2]
ldrd_reg r6, r7, r0, r2
uhadd8 r5, r5, r10
and r8, r8, lr
ldr r10, [r1, #4]
and r12, r12, lr
uadd8 r4, r4, r8
ldr r9, [r1], r2
ldr_post r9, r1, r2
eor r8, r6, r9
uadd8 r5, r5, r12
pld [r1, r2, lsl #1]
eor r12, r7, r10
uhadd8 r6, r6, r9
strd r4, r5, [r0], r2
strd_post r4, r5, r0, r2
uhadd8 r7, r7, r10
beq 2f
and r8, r8, lr
ldrd r4, r5, [r0, r2]
ldrd_reg r4, r5, r0, r2
uadd8 r6, r6, r8
ldr r10, [r1, #4]
and r12, r12, lr
subs r3, r3, #2
uadd8 r7, r7, r12
ldr r9, [r1], r2
strd r6, r7, [r0], r2
ldr_post r9, r1, r2
strd_post r6, r7, r0, r2
b 1b
2:
and r8, r8, lr
and r12, r12, lr
uadd8 r6, r6, r8
uadd8 r7, r7, r12
strd r6, r7, [r0], r2
strd_post r6, r7, r0, r2
pop {r4-r10, pc}
endfunc
@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1
orr r6, r8, r5, lsl #8
orr r7, r4, lr, lsl #8
subs r3, r3, #1
strd r6, r7, [r1], r2
strd_post r6, r7, r1, r2
bgt 1b
pop {r4-r8,pc}
endfunc
@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1
push {r4-r8, lr}
mov lr, #8
1:
ldrd r4, r5, [r1], r2
ldrd_post r4, r5, r1, r2
subs lr, lr, #1
uxtb16 r6, r4
uxtb16 r4, r4, ror #8
@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1
push {r4-r9, lr}
mov lr, #8
1:
ldrd r4, r5, [r1], r3
ldrd r6, r7, [r2], r3
ldrd_post r4, r5, r1, r3
ldrd_post r6, r7, r2, r3
uxtb16 r8, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r6
@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1
push {r4-r9, lr}
mov r0, #0
mov lr, #0
ldrd r4, r5, [r1], r3
ldrd_post r4, r5, r1, r3
1:
subs r12, r12, #2
ldr r7, [r2, #4]
ldr r6, [r2], r3
ldrd r8, r9, [r1], r3
ldr_post r6, r2, r3
ldrd_post r8, r9, r1, r3
usada8 r0, r4, r6, r0
pld [r2, r3]
usada8 lr, r5, r7, lr
ldr r7, [r2, #4]
ldr r6, [r2], r3
ldr_post r6, r2, r3
beq 2f
ldrd r4, r5, [r1], r3
ldrd_post r4, r5, r1, r3
usada8 r0, r8, r6, r0
pld [r2, r3]
usada8 lr, r9, r7, lr
@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
ldr r4, [r0, r1]!
ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3
bgt 1b
2:

@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1
2: vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
it lt
bxlt lr
3: vld1.32 {d2-d3}, [r1,:128]
@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2
2: vst1.32 {q2},[r0,:128]!
vst1.32 {q3},[r0,:128]!
ands len, len, #15
it eq
bxeq lr
3: vld1.32 {q0},[r1,:128]!
vmul.f32 q0, q0, q8
@ -638,6 +640,7 @@ NOVFP ldr r3, [sp]
2: vst1.32 {q8},[r0,:128]!
vst1.32 {q9},[r0,:128]!
ands r3, r3, #7
it eq
popeq {pc}
3: vld1.32 {q0},[r1,:128]!
ldr r12, [r2], #4

@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1
1:
subs r3, r3, #16
vmul.f32 s12, s4, s12
itttt ge
vldmiage r1!, {s16-s19}
vldmiage r2!, {s24-s27}
vldmiage r1!, {s20-s23}
vldmiage r2!, {s28-s31}
it ge
vmulge.f32 s24, s16, s24
vstmia r0!, {s8-s11}
vstmia r0!, {s12-s15}
it ge
vmulge.f32 s28, s20, s28
itttt gt
vldmiagt r1!, {s0-s3}
vldmiagt r2!, {s8-s11}
vldmiagt r1!, {s4-s7}
vldmiagt r2!, {s12-s15}
ittt ge
vmulge.f32 s8, s0, s8
vstmiage r0!, {s24-s27}
vstmiage r0!, {s28-s31}
@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1
vmul.f32 s11, s0, s11
1:
subs r3, r3, #16
it ge
vldmdbge r2!, {s16-s19}
vmul.f32 s12, s7, s12
it ge
vldmiage r1!, {s24-s27}
vmul.f32 s13, s6, s13
it ge
vldmdbge r2!, {s20-s23}
vmul.f32 s14, s5, s14
it ge
vldmiage r1!, {s28-s31}
vmul.f32 s15, s4, s15
it ge
vmulge.f32 s24, s19, s24
it gt
vldmdbgt r2!, {s0-s3}
it ge
vmulge.f32 s25, s18, s25
vstmia r0!, {s8-s13}
it ge
vmulge.f32 s26, s17, s26
it gt
vldmiagt r1!, {s8-s11}
itt ge
vmulge.f32 s27, s16, s27
vmulge.f32 s28, s23, s28
it gt
vldmdbgt r2!, {s4-s7}
it ge
vmulge.f32 s29, s22, s29
vstmia r0!, {s14-s15}
ittt ge
vmulge.f32 s30, s21, s30
vmulge.f32 s31, s20, s31
vmulge.f32 s8, s3, s8
it gt
vldmiagt r1!, {s12-s15}
itttt ge
vmulge.f32 s9, s2, s9
vmulge.f32 s10, s1, s10
vstmiage r0!, {s24-s27}
vmulge.f32 s11, s0, s11
it ge
vstmiage r0!, {s28-s31}
bgt 1b

@ -71,6 +71,7 @@ endfunc
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
itt lt
ldrlt r1, [r1]
blt ff_float_to_int16_neon
bne 4f
@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
subs r3, r3, #4
it eq
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8
@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.32 {d23[1]}, [r8], ip
8: subs r3, r3, #2
add r0, r0, #4
it eq
popeq {r4-r8,pc}
@ 1 channel
@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
it eq
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16

@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
it gt
vldmiagt r1!, {s16-s23}
ssat r4, #16, r4
ssat r3, #16, r3
@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16
itttt gt
vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
itttt gt
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22

@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
pld [r1]
pld [r1, r2]
muls r7, r4, r5
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb ip, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
pld [r1]
pld [r1, r2]
muls r7, r4, r5
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb ip, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
pop {r4-r6, pc}
2:
.ifc \type,put
ldrh r5, [r1], r2
strh r5, [r0], r2
ldrh r6, [r1], r2
strh r6, [r0], r2
ldrh_post r5, r1, r2
strh_post r5, r0, r2
ldrh_post r6, r1, r2
strh_post r6, r0, r2
.else
vld1.16 {d16[0]}, [r1], r2
vld1.16 {d16[1]}, [r1], r2
@ -404,28 +408,17 @@ endfunc
ldr ip, [sp]
tst r2, r2
ldr ip, [ip]
it ne
tstne r3, r3
vmov.32 d24[0], ip
and ip, ip, ip, lsl #16
it eq
bxeq lr
ands ip, ip, ip, lsl #8
it lt
bxlt lr
.endm
@ align_push_regs: save d8-d15 on the stack at 16-byte aligned addresses.
@ ip = (sp & 15) + 32 and sp is lowered by that amount, which clears the
@ low four bits of sp and reserves 32 bytes; d12-d15 are stored there.
@ sp is then lowered by another 32 bytes and d8-d11 are stored.
@ ip keeps the first adjustment; align_pop_regs uses it as its final
@ post-increment, so ip has to hold this value until then.
.macro align_push_regs
and ip, sp, #15
add ip, ip, #32
sub sp, sp, ip
vst1.64 {d12-d15}, [sp,:128]
sub sp, sp, #32
vst1.64 {d8-d11}, [sp,:128]
.endm
@ align_pop_regs: reload d8-d15 stored by align_push_regs and restore sp.
@ The first load post-increments sp by the size of the transfer; the second
@ load post-increments sp by ip, which must still hold the adjustment
@ computed in align_push_regs.
.macro align_pop_regs
vld1.64 {d8-d11}, [sp,:128]!
vld1.64 {d12-d15}, [sp,:128], ip
.endm
.macro h264_loop_filter_luma
vdup.8 q11, r2 @ alpha
vmovl.u8 q12, d24
@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
vld1.64 {d18,d19}, [r0,:128], r1
vld1.64 {d16,d17}, [r0,:128], r1
align_push_regs
vpush {d8-d15}
h264_loop_filter_luma
@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
vst1.64 {d0, d1}, [r0,:128], r1
vst1.64 {d10,d11}, [r0,:128]
align_pop_regs
vpop {d8-d15}
bx lr
endfunc
@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
align_push_regs
vpush {d8-d15}
h264_loop_filter_luma
@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
vst1.32 {d1[1]}, [r0], r1
vst1.32 {d11[1]}, [r0], r1
align_pop_regs
vpop {d8-d15}
bx lr
endfunc
@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon
vrhadd.u8 d11, d11, d7
sub r0, r0, r2, lsl #3
.endif
vst1.64 {d12}, [r0,:64], r2
vst1.64 {d13}, [r0,:64], r2
vst1.64 {d14}, [r0,:64], r2
@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
\type\()_h264_qpel8_mc11:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #64
mov r0, sp
sub r1, r1, #2
@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
mov ip, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
ldrd r0, [r11]
ldrd r0, [r11], #8
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
mov sp, r11
pop {r11, pc}
endfunc
@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
\type\()_h264_qpel8_mc21:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, #2
mov r3, #8
@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
mov r4, r0
ldrd r0, [r11]
ldrd r0, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
\type\()_h264_qpel8_mc12:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, r2, lsl #1
mov r3, r2
@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
vpush {d8-d15}
bl put_h264_qpel8_v_lowpass_neon
mov r4, r0
ldrd r0, [r11]
ldrd r0, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc22_neon, export=1
push {r4, r10, r11, lr}
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r4, r11, #15
T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1
\type\()_h264_qpel16_mc11:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #256
mov r0, sp
sub r1, r1, #2
mov r3, #16
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon
ldrd r0, [r11]
ldrd r0, [r11], #8
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #16
bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
mov sp, r11
pop {r4, r11, pc}
endfunc
@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1
\type\()_h264_qpel16_mc21:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, #2
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon_packed
mov r4, r0
ldrd r0, [r11]
ldrd r0, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
\type\()_h264_qpel16_mc12:
lowpass_const r3
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, r2, lsl #1
mov r0, sp
@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
vpush {d8-d15}
bl put_h264_qpel16_v_lowpass_neon_packed
mov r4, r0
ldrd r0, [r11]
ldrd r0, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
mov r2, r3
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
add sp, r11, #8
mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1
push {r4, r9-r11, lr}
lowpass_const r3
mov r11, sp
bic sp, sp, #15
A bic sp, sp, #15
T bic r4, r11, #15
T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2

@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
adrne lr, ff_h264_idct_dc_add_neon
adreq lr, ff_h264_idct_add_neon
ite ne
adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
blx lr
2: subs ip, ip, #1
add r1, r1, #32
@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1
add r0, r0, r4
cmp r8, #0
ldrsh r8, [r1]
adrne lr, ff_h264_idct_add_neon
adreq lr, ff_h264_idct_dc_add_neon
iteet ne
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
subs ip, ip, #1
@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1
add r1, r3, r12, lsl #5
cmp r8, #0
ldrsh r8, [r1]
adrne lr, ff_h264_idct_add_neon
adreq lr, ff_h264_idct_dc_add_neon
iteet ne
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
add r12, r12, #1
cmp r12, #4
itt eq
moveq r12, #16
moveq r4, r9
cmp r12, #20
@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
adrne lr, ff_h264_idct8_dc_add_neon
adreq lr, ff_h264_idct8_add_neon
ite ne
adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
blx lr
2: subs r12, r12, #4
add r1, r1, #128

@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c)
__asm__ (
"mov %0, %2 \n\t"
"cmp %1, %2 \n\t"
"itt gt \n\t"
"movgt %0, %1 \n\t"
"movgt %1, %2 \n\t"
"cmp %1, %3 \n\t"
"it le \n\t"
"movle %1, %3 \n\t"
"cmp %0, %1 \n\t"
"it gt \n\t"
"movgt %0, %1 \n\t"
: "=&r"(m), "+r"(a)
: "r"(b), "r"(c)

@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1
vadd.f32 d17, d17, d3 @ in2u+in1d -I
1:
vmul.f32 d7, d0, d21 @ I*s
ldr r10, [r3, lr, lsr #1]
A ldr r10, [r3, lr, lsr #1]
T lsr r10, lr, #1
T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s

@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
sum8 r8, r9, r1, r0, r10, r11, r12, lr
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
round r10, r8, r9
strh r10, [r3], r4
strh_post r10, r3, r4
mov lr, #15
1:
@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
round r10, r8, r9
adds r8, r8, r4
adc r9, r9, r7
strh r10, [r3], r12
strh_post r10, r3, r12
round r11, r8, r9
subs lr, lr, #1
strh r11, [r5], -r12
strh_dpost r11, r5, r12
bgt 1b
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33

@ -38,15 +38,21 @@
@ dequant_t: conditional multiply-accumulate on the top halfword of src.
@ Flags come from (src asr 16) - ip; ip is set outside this macro and its
@ value is not visible here.
@   result > 0 : tmp = add
@   result < 0 : tmp = -add
@   result != 0: dst = top16(src) * bottom16(mul) + tmp
@   result == 0: dst is left untouched
@ None of the conditional instructions set flags, so every condition
@ refers to the initial rsbs.  Each it line is the IT-block prefix that
@ Thumb-2 requires in front of a conditional instruction.
.macro dequant_t dst, src, mul, add, tmp
rsbs \tmp, ip, \src, asr #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlatbne \dst, \src, \mul, \tmp
.endm
@ dequant_b: conditional multiply-accumulate on the bottom halfword of src.
@ Flags come from (src lsl 16) - ip; ip is set outside this macro and its
@ value is not visible here.
@   result > 0 : tmp = add
@   result < 0 : tmp = -add
@   result != 0: dst = bottom16(src) * bottom16(mul) + tmp
@   result == 0: dst is left untouched
@ None of the conditional instructions set flags, so every condition
@ refers to the initial rsbs.  Each it line is the IT-block prefix that
@ Thumb-2 requires in front of a conditional instruction.
.macro dequant_b dst, src, mul, add, tmp
rsbs \tmp, ip, \src, lsl #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlabbne \dst, \src, \mul, \tmp
.endm
@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1
strh lr, [r0], #2
subs r3, r3, #8
it gt
ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
bgt 1b
adds r3, r3, #2
it le
pople {r4-r9,pc}
2:
ldrsh r9, [r0, #0]
ldrsh lr, [r0, #2]
mov r8, r2
cmp r9, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne r9, r9, r1, r8
mov r8, r2
cmp lr, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne lr, lr, r1, r8
strh r9, [r0], #2
strh lr, [r0], #2

@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1
subs r3, r3, #16
vst1.16 {q0}, [r1,:128]!
vst1.16 {q8}, [r1,:128]!
it le
bxle lr
cmp r3, #8
bgt 1b
@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldr r6, [r0, #AC_PRED]
add lr, r0, #INTER_SCANTAB_RASTER_END
cmp r6, #0
it ne
movne r12, #63
bne 1f
ldr r12, [r12, r2, lsl #2]
@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldrsh r4, [r1]
cmp r5, #0
mov r5, r1
it ne
movne r2, #0
bne 2f
cmp r2, #4
it ge
addge r0, r0, #4
sub r2, r3, #1
ldr r6, [r0, #Y_DC_SCALE]

@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1
vst1.32 {d22}, [r5,:64]
cmp r6, #0
it eq
popeq {r4-r8,pc}
vmul.f32 d22, d22, d18

@ -121,11 +121,13 @@ __b_evaluation:
ldr r11, [r12, #offW7] @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if null avoid muls
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if null avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@ -148,19 +150,23 @@ __b_evaluation:
@@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5]
teq r3, #0 @ if null avoid muls
teq r3, #0 @ if null avoid muls
it ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7]
itttt ne
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
@@ R3 is free now
teq r4, #0 @ if null avoid muls
teq r4, #0 @ if null avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
@@ R4 is free now
__end_b_evaluation:
@ -204,16 +210,19 @@ __a_evaluation:
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
teq r11, #0 @ if null avoid muls
it ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
itttt ne
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@ -222,6 +231,7 @@ __a_evaluation:
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
@ -323,10 +333,12 @@ __b_evaluation2:
ldrsh r2, [r14, #48]
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if 0, then avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@ -342,18 +354,22 @@ __b_evaluation2:
@@ MAC16(b1, -W5, col[7x8]);
ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
teq r3, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
it ne
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
@@ R3 is free now
teq r4, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
@@ R4 is free now
__end_b_evaluation2:
@ -390,15 +406,18 @@ __a_evaluation2:
@@ a3 += W4*row[4]
ldrsh r11, [r14, #64] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
itttt ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
ldrsh r9, [r14, #96] @ R9=ROWr16[6]
it ne
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@ -407,6 +426,7 @@ __a_evaluation2:
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation2:

@ -49,6 +49,7 @@ function idct_row_armv5te
ldrd v1, [a1, #8]
ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
orrs v1, v1, v2
itt eq
cmpeq v1, a4
cmpeq v1, a3, lsr #16
beq row_dc_only
@ -269,6 +270,7 @@ function idct_col_armv5te
ldmfd sp!, {a3, a4}
adds a2, a3, v1
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, v2
mov ip, ip, asr #20
@ -276,6 +278,7 @@ function idct_col_armv5te
str a2, [a1]
subs a3, a3, v1
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, v2
mov a4, a4, asr #20
@ -285,6 +288,7 @@ function idct_col_armv5te
subs a2, a3, v3
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
sub ip, a4, v4
mov ip, ip, asr #20
@ -292,6 +296,7 @@ function idct_col_armv5te
str a2, [a1, #(16*1)]
adds a3, a3, v3
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
add a4, a4, v4
mov a4, a4, asr #20
@ -301,6 +306,7 @@ function idct_col_armv5te
adds a2, a3, v5
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, v6
mov ip, ip, asr #20
@ -308,6 +314,7 @@ function idct_col_armv5te
str a2, [a1, #(16*2)]
subs a3, a3, v5
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, v6
mov a4, a4, asr #20
@ -317,6 +324,7 @@ function idct_col_armv5te
adds a2, a3, v7
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, fp
mov ip, ip, asr #20
@ -324,6 +332,7 @@ function idct_col_armv5te
str a2, [a1, #(16*3)]
subs a3, a3, v7
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, fp
mov a4, a4, asr #20
@ -335,15 +344,19 @@ endfunc
@ clip: move src into dst and clamp the result to the range 0..255.
@ src is vararg, so it may be a register with a shift applied.
@ movs sets the flags: a negative value (mi) is replaced by 0, then a
@ signed compare against 255 replaces anything larger by 255.
@ Each it line is the IT-block prefix that Thumb-2 requires in front of
@ a conditional instruction.  The flags are clobbered.
.macro clip dst, src:vararg
movs \dst, \src
it mi
movmi \dst, #0
cmp \dst, #255
it gt
movgt \dst, #255
.endm
@ aclip: add, then clamp the sum to the range 0..255.
@ src is vararg and supplies all source operands of the adds (for example
@ two registers, the second optionally shifted).
@ adds sets the flags: a negative sum (mi) is replaced by 0, then a signed
@ compare against 255 replaces anything larger by 255.
@ Each it line is the IT-block prefix that Thumb-2 requires in front of
@ a conditional instruction.  The flags are clobbered.
.macro aclip dst, src:vararg
adds \dst, \src
it mi
movmi \dst, #0
cmp \dst, #255
it gt
movgt \dst, #255
.endm
@ -370,35 +383,35 @@ function idct_col_put_armv5te
orr a2, a3, a4, lsl #8
rsb v2, lr, lr, lsl #3
ldmfd sp!, {a3, a4}
strh a2, [v2, v1]!
strh_pre a2, v2, v1
sub a2, a3, v3
clip a2, a2, asr #20
sub ip, a4, v4
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
strh a2, [v1, lr]!
strh_pre a2, v1, lr
add a3, a3, v3
clip a2, a3, asr #20
add a4, a4, v4
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
strh a2, [v2, -lr]!
strh_dpre a2, v2, lr
add a2, a3, v5
clip a2, a2, asr #20
add ip, a4, v6
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
strh a2, [v1, lr]!
strh_pre a2, v1, lr
sub a3, a3, v5
clip a2, a3, asr #20
sub a4, a4, v6
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
strh a2, [v2, -lr]!
strh_dpre a2, v2, lr
add a2, a3, v7
clip a2, a2, asr #20
@ -411,7 +424,7 @@ function idct_col_put_armv5te
sub a4, a4, fp
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
strh a2, [v2, -lr]
strh_dpre a2, v2, lr
ldr pc, [sp], #4
endfunc
@ -436,7 +449,7 @@ function idct_col_add_armv5te
ldr v1, [sp, #32]
sub a4, a4, v2
rsb v2, v1, v1, lsl #3
ldrh ip, [v2, lr]!
ldrh_pre ip, v2, lr
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
@ -448,7 +461,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]!
ldrh_pre ip, lr, v1
sub a2, a3, v3
add a3, a3, v3
and v3, ip, #255
@ -458,7 +471,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
add a4, a4, v4
ldrh ip, [v2, -v1]!
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
@ -468,7 +481,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]!
ldrh_pre ip, lr, v1
add a2, a3, v5
sub a3, a3, v5
and v3, ip, #255
@ -478,7 +491,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, v6
ldrh ip, [v2, -v1]!
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
@ -488,7 +501,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]!
ldrh_pre ip, lr, v1
add a2, a3, v7
sub a3, a3, v7
and v3, ip, #255
@ -498,7 +511,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, fp
ldrh ip, [v2, -v1]!
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20

@ -200,6 +200,7 @@ function idct_row_armv6
ldr r3, [r0, #8] /* r3 = row[3,1] */
ldr r2, [r0] /* r2 = row[2,0] */
orrs lr, lr, ip
itt eq
cmpeq lr, r3
cmpeq lr, r2, lsr #16
beq 1f
@ -282,14 +283,14 @@ function idct_col_put_armv6
pop {r1, r2}
idct_finish_shift_sat COL_SHIFT
strb r4, [r1], r2
strb r5, [r1], r2
strb r6, [r1], r2
strb r7, [r1], r2
strb r11,[r1], r2
strb r10,[r1], r2
strb r9, [r1], r2
strb r8, [r1], r2
strb_post r4, r1, r2
strb_post r5, r1, r2
strb_post r6, r1, r2
strb_post r7, r1, r2
strb_post r11,r1, r2
strb_post r10,r1, r2
strb_post r9, r1, r2
strb_post r8, r1, r2
sub r1, r1, r2, lsl #3
@ -318,16 +319,16 @@ function idct_col_add_armv6
add ip, r3, ip, asr #COL_SHIFT
usat ip, #8, ip
add r4, r7, r4, asr #COL_SHIFT
strb ip, [r1], r2
strb_post ip, r1, r2
ldrb ip, [r1, r2]
usat r4, #8, r4
ldrb r11,[r1, r2, lsl #2]
add r5, ip, r5, asr #COL_SHIFT
usat r5, #8, r5
strb r4, [r1], r2
strb_post r4, r1, r2
ldrb r3, [r1, r2]
ldrb ip, [r1, r2, lsl #2]
strb r5, [r1], r2
strb_post r5, r1, r2
ldrb r7, [r1, r2]
ldrb r4, [r1, r2, lsl #2]
add r6, r3, r6, asr #COL_SHIFT
@ -340,11 +341,11 @@ function idct_col_add_armv6
usat r8, #8, r8
add lr, r4, lr, asr #COL_SHIFT
usat lr, #8, lr
strb r6, [r1], r2
strb r10,[r1], r2
strb r9, [r1], r2
strb r8, [r1], r2
strb lr, [r1], r2
strb_post r6, r1, r2
strb_post r10,r1, r2
strb_post r9, r1, r2
strb_post r8, r1, r2
strb_post lr, r1, r2
sub r1, r1, r2, lsl #3

@ -71,7 +71,7 @@ function idct_row4_pld_neon
add r3, r0, r1, lsl #2
pld [r0, r1]
pld [r0, r1, lsl #1]
pld [r3, -r1]
A pld [r3, -r1]
pld [r3]
pld [r3, r1]
add r3, r3, r1, lsl #1
@ -164,6 +164,7 @@ function idct_col4_neon
orrs r4, r4, r5
idct_col4_top
it eq
addeq r2, r2, #16
beq 1f
@ -176,6 +177,7 @@ function idct_col4_neon
1: orrs r6, r6, r7
ldrd r4, [r2, #16]
it eq
addeq r2, r2, #16
beq 2f
@ -187,6 +189,7 @@ function idct_col4_neon
2: orrs r4, r4, r5
ldrd r4, [r2, #16]
it eq
addeq r2, r2, #16
beq 3f
@ -199,6 +202,7 @@ function idct_col4_neon
vadd.i32 q13, q13, q8
3: orrs r4, r4, r5
it eq
addeq r2, r2, #16
beq 4f

@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale
vst1.32 {q9}, [r2,:128]
subs r1, r1, #1
it eq
popeq {r4-r11,pc}
cmp r4, #0
itt eq
subeq r8, r8, #512*4
subeq r9, r9, #512*4
sub r5, r5, #512*4

@ -21,6 +21,14 @@
#ifndef AVCODEC_ARM_VP56_ARITH_H
#define AVCODEC_ARM_VP56_ARITH_H
/*
 * Instruction-set selectors for the inline asm strings below:
 * A(x) expands to x only when CONFIG_THUMB is not set, T(x) expands to x
 * only when CONFIG_THUMB is set; the other one expands to nothing.
 * This lets a single asm template carry both variants of an instruction.
 */
#if CONFIG_THUMB
# define A(x)
# define T(x) x
#else
# define A(x) x
# define T(x)
#endif
#if HAVE_ARMV6 && HAVE_INLINE_ASM
#define vp56_rac_get_prob vp56_rac_get_prob_armv6
@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
unsigned bit;
__asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n"
"ldrcsh %2, [%4], #2 \n"
A("ldrcsh %2, [%4], #2 \n")
T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n"
"orrcs %1, %1, %2, lsl %3 \n"
T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"cmp %1, %0, lsl #16 \n"
"ittte ge \n"
"subge %1, %1, %0, lsl #16 \n"
"subge %0, %5, %0 \n"
"movge %2, #1 \n"
@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
unsigned tmp;
__asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n"
"ldrcsh %2, [%4], #2 \n"
A("ldrcsh %2, [%4], #2 \n")
T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n"
"orrcs %1, %1, %2, lsl %3 \n"
T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"lsl %2, %0, #16 \n"

@ -25,13 +25,18 @@
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
rsb \h, \pr, #256
it cs
ldrhcs \t1, [\buf], #2
smlabb \h, \t0, \pr, \h
T itttt cs
rev16cs \t1, \t1
orrcs \cw, \cw, \t1, lsl \bs
A orrcs \cw, \cw, \t1, lsl \bs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
@ -40,14 +45,20 @@
adds \bs, \bs, \t0
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
it cs
ldrhcs \t1, [\buf], #2
mov \h, #128
it cs
rev16cs \t1, \t1
add \h, \h, \t0, lsl #7
orrcs \cw, \cw, \t1, lsl \bs
A orrcs \cw, \cw, \t1, lsl \bs
T ittt cs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1
cmp r3, #0
ldr r11, [r5]
ldm r0, {r5-r7} @ high, bits, buf
it ne
pkhtbne r11, r11, r11, asr #16
ldr r8, [r0, #16] @ code_word
0:
@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1
adds r6, r6, r9
add r4, r4, #11
lsl r8, r8, r9
it cs
ldrhcs r10, [r7], #2
lsl r9, r5, r9
mov r5, #128
it cs
rev16cs r10, r10
add r5, r5, r9, lsl #7
orrcs r8, r8, r10, lsl r6
T ittt cs
T lslcs r10, r10, r6
T orrcs r8, r8, r10
A orrcs r8, r8, r10, lsl r6
subcs r6, r6, #16
lsr r5, r5, #8
cmp r8, r5, lsl #16
movrel r10, zigzag_scan-1
itt ge
subge r8, r8, r5, lsl #16
subge r5, r9, r5
ldrb r10, [r10, r3]
it ge
rsbge r12, r12, #0
cmp r3, #16
strh r12, [r1, r10]
@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1
ldr r0, [sp]
ldr r9, [r0, #12]
cmp r7, r9
it hi
movhi r7, r9
stm r0, {r5-r7} @ high, bits, buf
str r8, [r0, #16] @ code_word
@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #2
ldrb r0, [r4, #4]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1
ldrb r9, [lr, r5]
blt 4f
ldrb r0, [r4, #5]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1
ldrb r9, [lr, r5]
b 4f
@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #5
mov r0, #159
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #7
mov r0, #165
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #2
ldrb r9, [lr, r5]
mov r0, #145
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
3:
ldrb r0, [r4, #8]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r4, r4, #1
ldrb r9, [lr, r5]
ite ge
movge r12, #2
movlt r12, #0
ldrb r0, [r4, #9]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
mov r9, #8
it ge
addge r12, r12, #1
movrel r4, X(ff_vp8_dct_cat_prob)
lsl r9, r9, r12
@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1
lsl r1, r1, #1
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r0, [r4], #1
it ge
addge r1, r1, #1
cmp r0, #0
bne 1b
@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1
add r4, r2, r4
add r4, r4, #22
rac_get_128 r5, r6, r7, r8, r9, r10
it ge
rsbge r12, r12, #0
smulbb r12, r12, r11
movrel r9, zigzag_scan-1

@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1
push {r4-r6,lr}
1:
subs r12, r12, #4
ldr r4, [r2], r3
ldr r5, [r2], r3
ldr r6, [r2], r3
ldr lr, [r2], r3
str r4, [r0], r1
str r5, [r0], r1
str r6, [r0], r1
str lr, [r0], r1
ldr_post r4, r2, r3
ldr_post r5, r2, r3
ldr_post r6, r2, r3
ldr_post lr, r2, r3
str_post r4, r0, r1
str_post r5, r0, r1
str_post r6, r0, r1
str_post lr, r0, r1
bgt 1b
pop {r4-r6,pc}
endfunc

@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b)
int r;
__asm__ ("cmp %2, #2 \n\t"
"ldr %0, [%3, %2, lsl #2] \n\t"
"ite le \n\t"
"lsrle %0, %1, #1 \n\t"
"smmulgt %0, %0, %1 \n\t"
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a)
{
int x, y;
__asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t"
"itet ne \n\t"
"mvnne %1, #1<<31 \n\t"
"moveq %0, %Q2 \n\t"
"eorne %0, %1, %R2, asr #31 \n\t"

Loading…
Cancel
Save