x86/aacpsdsp: optimize ff_ps_mul_pair_single_sse

~2% faster.
pull/120/merge
James Almer 8 years ago
parent caf7d6178a
commit 933dd62288
  1. 21 changed lines (12 added, 9 removed)
      libavcodec/x86/aacpsdsp.asm

@@ -62,24 +62,27 @@ PS_ADD_SQUARES 3
; float *src1, int n); ; float *src1, int n);
;******************************************************************* ;*******************************************************************
INIT_XMM sse INIT_XMM sse
cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
xor r4q, r4q shl nd, 3
add src1q, nq
add dstq, nq
neg nq
align 16
.loop: .loop:
movu m0, [src1q+r4q] movu m0, [src1q+nq]
movu m1, [src1q+r4q+mmsize] movu m1, [src1q+nq+mmsize]
mova m2, [src2q] mova m2, [src2q]
mova m3, m2 mova m3, m2
unpcklps m2, m2 unpcklps m2, m2
unpckhps m3, m3 unpckhps m3, m3
mulps m0, m2 mulps m0, m2
mulps m1, m3 mulps m1, m3
mova [dstq+r4q], m0 mova [dstq+nq], m0
mova [dstq+r4q+mmsize], m1 mova [dstq+nq+mmsize], m1
add src2q, mmsize add src2q, mmsize
add r4q, mmsize*2 add nq, mmsize*2
sub nd, mmsize/4 jl .loop
jg .loop
REP_RET REP_RET
;*********************************************************************** ;***********************************************************************

Loading…
Cancel
Save