x86: simple_idct10_template: fix overflow in pass

When the input of a pass has 15 or 16 bits of precision (in particular
the column pass), the addition of a bias to W4 may lead to overflows
in the input to pmaddwd.

This requires postponing the addition of the bias until after the first
butterfly. To do so, the patch exploits the fact that m15, although zeroed,
is otherwise unused. If the pass is safe, an address can be used directly,
and the number of xmm regs can be decreased. Otherwise, the 32-bit bias
is loaded into m15.

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
pull/154/head
Christophe Gisquet 9 years ago committed by Michael Niedermayer
parent 3b336ec2fb
commit e652f69b35
  1. 8
      libavcodec/x86/proresdsp.asm
  2. 13
      libavcodec/x86/simple_idct10_template.asm

@ -37,17 +37,17 @@ cextern pw_1019
section .text align=16
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
%macro idct_put_fn 0
cglobal prores_idct_put_10, 4, 4, 15
IDCT_PUT_FN pw_1, 15, pw_88, 18, pw_4, pw_1019, r3
RET
%endmacro
INIT_XMM sse2
idct_put_fn 16
idct_put_fn
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
idct_put_fn
%endif
%endif

@ -75,6 +75,7 @@ cextern w7_min_w5
; a2 -= W6 * row[2];
; a3 -= W2 * row[2];
%ifstr %1
mova m15, [pd_round_ %+ %2]
%else
paddw m10, [%1]
%endif
@ -87,6 +88,17 @@ cextern w7_min_w5
pmaddwd m7, m1, [w4_min_w2]
pmaddwd m0, [w4_plus_w2]
pmaddwd m1, [w4_plus_w2]
%ifstr %1
; Adding 1<<(%2-1) for >=15 bits values
paddd m2, m15
paddd m3, m15
paddd m4, m15
paddd m5, m15
paddd m6, m15
paddd m7, m15
paddd m0, m15
paddd m1, m15
%endif
; a0: -1*row[0]-1*row[2]
; a1: -1*row[0]
@ -225,7 +237,6 @@ cextern w7_min_w5
%macro IDCT_PUT_FN 6-7
movsxd r1, r1d
pxor m15, m15 ; zero
; for (i = 0; i < 8; i++)
; idctRowCondDC(block + i*8);

Loading…
Cancel
Save