FFmpeg/libavcodec/x86/sbrdsp.asm

;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask         times 2 dd 1<<31, 0
ps_neg          times 4 dd 1<<31

SECTION_TEXT

INIT_XMM sse
;-----------------------------------------------------------------------------
; sbr_sum_square: returns the sum of the squares of a float buffer.
;
; In:   r0 = buffer address (only unaligned loads are used)
;       r1 = length in 8-byte units (pairs of floats); only the low 32 bits
;            are read, see below
; Out:  low float of m0 = sum; on x86-32 the value is additionally stored to
;       the r0 argument slot and loaded onto the x87 stack from there
;
; The main loop consumes 8 units (64 bytes) per pass and the tail loop
; consumes 2 units (16 bytes) per pass, so an odd trailing unit is never
; summed: callers are expected to pass an even length.
;
; The length is presumably a C "int" (TODO confirm against the C prototype).
; The x86-64 calling conventions leave bits 63:32 of a register carrying a
; 32-bit argument undefined, so every counter operation works on r1d/r2d
; rather than on the full-width registers.
;-----------------------------------------------------------------------------
cglobal sbr_sum_square, 2, 3, 6
    mov        r2d, r1d
    xorps       m0, m0              ; m0 and m1 are two independent accumulators
    xorps       m1, m1
    sar        r2d, 3               ; r2d = number of 64-byte main-loop passes
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec        r2d
    jnz         .loop
.prepare:
    and        r1d, 7               ; units left over after the main loop
    sar        r1d, 1               ; tail loop handles two units per pass
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec        r1d                  ; flags survive the addps below
    addps       m0, m2
    jnz         .endloop
.end:
    ; horizontal reduction: merge the accumulators, then fold 4 lanes into 1
    addps       m0, m1
    movhlps     m2, m0              ; low half of m2 = lanes 2,3 of m0
    addps       m0, m2              ; lanes 0,1 = (0+2), (1+3)
    movss       m1, m0
    shufps      m0, m0, 1           ; bring lane 1 down to lane 0
    addss       m0, m1
%if ARCH_X86_64 == 0
    ; x86-32: pass the result through memory to get it onto the x87 stack
    movss       r0m,  m0
    fld         dword r0m
%endif
    RET

; Byte distance between two consecutive source entries read from X_high
; (presumably one row of 40 float pairs -- TODO confirm against the C layout).
%define STEP  40*4*2

;-----------------------------------------------------------------------------
; sbr_hf_g_filt: multiply strided float pairs by per-entry scalar gains.
;
; In:   r0 = Y       destination, one float pair (8 bytes) per entry,
;                    written with unaligned stores
;       r1 = X_high  source base; reading starts at r1 + 8*r4 and advances by
;                    STEP bytes per entry
;       r2 = g_filt  one float gain per entry
;       r3 = m_max   number of entries
;       r4 = ixh     offset into X_high, in 8-byte units
;
; For every entry m in [0, m_max): Y[m] = X[m] * g_filt[m], applied to both
; floats of the pair.
;
; The count is split into a 4-entries-per-pass vector loop (m_max & 0xFC, so
; at most 252 entries go through it) and a 1-entry-per-pass tail (m_max & 3).
; The tail count is computed up front and tested before the tail loop is
; entered, so m_max == 0 falls straight through to .end instead of entering
; the tail with a zero counter that would wrap around on the first dec.
; Masking in 32 bits also discards whatever the caller left in the upper half
; of the m_max register on x86-64.
;-----------------------------------------------------------------------------
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov        r5d, r3d
    and         r3, 0xFC        ; entries handled by the 4-wide loop
    and        r5d, 3           ; number of single element loops
    lea         r2, [r2 + r3*4] ; point past the vectorised part of g_filt ...
    lea         r0, [r0 + r3*8] ; ... and of Y; r3 then counts up from -n to 0
    neg         r3
    jz          .tail           ; fewer than 4 entries: tail loop only
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]     ; g0 g1
    movlps      m1, [r2 + 4*r3 + 8]     ; g2 g3
    movlps      m2, [r1 + 0*STEP]       ; source entries 0..3, gathered
    movlps      m3, [r1 + 2*STEP]       ; from four strided locations
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0                  ; g0 g0 g1 g1
    unpcklps    m1, m1                  ; g2 g2 g3 g3
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
.tail:
    test       r5d, r5d
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0                  ; gain duplicated into lanes 0 and 1
    mulps       m2, m0
    movlps    [r0], m2                  ; only the low pair is stored
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec        r5d
    jnz         .loop1
.end:
    RET

; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
;                          const float alpha0[2], const float alpha1[2],
;                          float bw, int start, int end)
;
; For i in [start, end), with complex multiplication:
;   X_high[i] = X_low[i] + (alpha0*bw) * X_low[i-1] + (alpha1*bw*bw) * X_low[i-2]
; Two outputs (16 bytes) are produced per loop pass.
;
; Preconditions visible from the code below:
;  - the loop counter advances by 16 bytes and stops only when it reaches
;    exactly zero, so end - start must be positive and even;
;  - X_low + start - 2 and X_high + start are accessed with aligned moves
;    (mova) and therefore must be 16-byte aligned.
;
; start and end are 32-bit ints. Where they arrive in registers (x86-64, non
; Windows) they are sign-extended before being used in 64-bit address
; arithmetic, because the calling convention does not define the upper half
; of a register that carries a 32-bit argument.
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss      bw, BWm
%endif
    movlps     m2, [alpha1q]
    movlps     m1, [alpha0q]
    shufps     bw, bw, 0
    mulps      m2, bw             ; (a1[0] a1[1])*bw
    mulps      m1, bw             ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova       m3, m1
    mova       m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    ; (r2/r3 held alpha0/alpha1, which have already been consumed above)
    mov        r2d, Sm
    mov        r3d, Em
%define  start r2q
%define  end   r3q
%else
; BW does not actually occupy a register, so shift by 1
%define  start BWq
%define  end   Sq
    movsxd   start, BWd          ; widen the 32-bit ints before pointer math
    movsxd     end, Sd
%endif
    sub      start, end          ; neg num of loops
    lea    X_highq, [X_highq + end*2*4]
    lea     X_lowq, [X_lowq  + end*2*4 - 2*2*4]
    shl      start, 3            ; offset from num loops

    ; m1/m2 = real parts of alpha0*bw / alpha1*bw*bw broadcast to all lanes,
    ; m3/m4 = the matching imaginary parts, negated in the even lanes via
    ; ps_mask so that the re/im-swapped products form a complex multiply.
    mova        m0, [X_lowq + start]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + start + 8]        ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301                   ; aAbB
    shufps      m7, m7, q2301                   ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start +16]        ; CcDd
    addps       m7, m0          ; m0 is kept: it is the i-2 input of the next pass
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + start], m7
    add     start, 16
    jnz         .loop2
    RET

; sbr_sum64x5: in-place column sum of five consecutive 64-float rows.
;
; In:   z = float buffer, accessed with aligned moves (mova/addps with a
;           memory operand), so it must be 16-byte aligned. Bytes [0, 1280)
;           are read and bytes [0, 256) are overwritten.
;
; For k in [0, 64):
;   z[k] = z[k] + z[k+64] + z[k+128] + z[k+192] + z[k+256]
; Each pass handles 8 floats (32 bytes); the loop stops once z has advanced
; by 256 bytes.
cglobal sbr_sum64x5, 1,2,4,z
    lea    r1q, [zq+ 256]       ; r1 = end of the first row = loop bound
.loop:
    ; m0/m2 accumulate rows 0, 2 and 4; m1/m3 accumulate rows 1 and 3
    mova    m0, [zq+   0]
    mova    m2, [zq+  16]
    mova    m1, [zq+ 256]
    mova    m3, [zq+ 272]
    addps   m0, [zq+ 512]
    addps   m2, [zq+ 528]
    addps   m1, [zq+ 768]
    addps   m3, [zq+ 784]
    addps   m0, [zq+1024]
    addps   m2, [zq+1040]
    addps   m0, m1              ; combine the two partial sums
    addps   m2, m3
    mova  [zq], m0
    mova  [zq+16], m2
    add     zq, 32
    cmp     zq, r1q
    jne  .loop
    REP_RET

INIT_XMM sse
;-----------------------------------------------------------------------------
; sbr_qmf_post_shuffle: interleave a negated, reversed copy of the upper half
; of z with the lower half of z.
;
; In:   W = destination, 64 floats written
;       z = source, 64 floats read
;       Both are accessed with aligned moves and must be 16-byte aligned.
;
; For k in [0, 32):
;   W[2*k]     = -z[63 - k]
;   W[2*k + 1] =  z[k]
;
; z walks upwards and r2 walks downwards from z + 60 floats, 16 bytes each per
; pass; the loop ends once the two pointers have crossed. They are addresses,
; so the comparison uses the unsigned condition (jb): a signed compare would
; misorder two pointers lying on either side of the sign boundary.
;-----------------------------------------------------------------------------
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea              r2q, [zq + (64-4)*4]   ; last 4 floats of z
    mova              m3, [ps_neg]          ; sign-bit mask for all 4 lanes
.loop:
    mova              m1, [zq]              ; z[k .. k+3]
    xorps             m0, m3, [r2q]         ; negated z[60-k .. 63-k]
    shufps            m0, m0, m0, q0123     ; reverse the lane order
    unpcklps          m2, m0, m1            ; -z[63-k], z[k], -z[62-k], z[k+1]
    unpckhps          m0, m0, m1            ; -z[61-k], z[k+2], -z[60-k], z[k+3]
    mova       [Wq +  0], m2
    mova       [Wq + 16], m0
    add               Wq, 32
    sub              r2q, 16
    add               zq, 16
    cmp               zq, r2q
    jb             .loop
    REP_RET
SBR DSP x86: implement SSE sbr_sum_square_sse The 32bits targets have been compiled with -mfpmath=sse for proper reference. sbr_sum_square C /32bits: 82c (unrolled)/102c C /64bits: 69c (unrolled)/82c SSE/32bits: 42c SSE/64bits: 31c Use of SSE4.1 dpps to perform the final sum is slower. Not unrolling to perform 8 operations in a loop yields 10 more cycles. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`;******************************************************************************`
			`;* AAC Spectral Band Replication decoding functions`
			`;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>`
			`;*`
			`;* This file is part of Libav.`
			`;*`
			`;* Libav is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* Libav is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with Libav; if not, write to the Free Software`
			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

x86: yasm: Use complete source path for macro helper %includes This is more consistent with the way we handle C #includes and it simplifies the build system. 13 years ago			`%include "libavutil/x86/x86util.asm"`
SBR DSP x86: implement SSE sbr_sum_square_sse The 32bits targets have been compiled with -mfpmath=sse for proper reference. sbr_sum_square C /32bits: 82c (unrolled)/102c C /64bits: 69c (unrolled)/82c SSE/32bits: 42c SSE/64bits: 31c Use of SSE4.1 dpps to perform the final sum is slower. Not unrolling to perform 8 operations in a loop yields 10 more cycles. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago			`SECTION_RODATA`
			`; mask equivalent for multiply by -1.0 1.0`
			`ps_mask times 2 dd 1<<31, 0`
x86: sbrdsp: Implement SSE qmf_post_shuffle 255 to 174 cycles on Arrandale / Win64. Unrolling yields no gain. Signed-off-by: Diego Biurrun <diego@biurrun.de> 12 years ago			`ps_neg times 4 dd 1<<31`
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago
			`SECTION_TEXT`
SBR DSP x86: implement SSE sbr_sum_square_sse The 32bits targets have been compiled with -mfpmath=sse for proper reference. sbr_sum_square C /32bits: 82c (unrolled)/102c C /64bits: 69c (unrolled)/82c SSE/32bits: 42c SSE/64bits: 31c Use of SSE4.1 dpps to perform the final sum is slower. Not unrolling to perform 8 operations in a loop yields 10 more cycles. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago
			`INIT_XMM sse`
			`cglobal sbr_sum_square, 2, 3, 6`
			`mov r2, r1`
			`xorps m0, m0`
			`xorps m1, m1`
			`sar r2, 3`
			`jz .prepare`
			`.loop:`
			`movu m2, [r0 + 0]`
			`movu m3, [r0 + 16]`
			`movu m4, [r0 + 32]`
			`movu m5, [r0 + 48]`
			`mulps m2, m2`
			`mulps m3, m3`
			`mulps m4, m4`
			`mulps m5, m5`
			`addps m0, m2`
			`addps m1, m3`
			`addps m0, m4`
			`addps m1, m5`
			`add r0, 64`
			`dec r2`
			`jnz .loop`
			`.prepare:`
			`and r1, 7`
			`sar r1, 1`
			`jz .end`
			`; len is a multiple of 2, thus there are at least 4 elements to process`
			`.endloop:`
			`movu m2, [r0]`
			`add r0, 16`
			`mulps m2, m2`
			`dec r1`
			`addps m0, m2`
			`jnz .endloop`
			`.end:`
			`addps m0, m1`
			`movhlps m2, m0`
			`addps m0, m2`
			`movss m1, m0`
			`shufps m0, m0, 1`
			`addss m0, m1`
			`%if ARCH_X86_64 == 0`
dsputil x86: use SSE float instruction instead of SSE2 integer equivalent All the more required since the users are pure SSE functions. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`movss r0m, m0`
SBR DSP x86: implement SSE sbr_sum_square_sse The 32bits targets have been compiled with -mfpmath=sse for proper reference. sbr_sum_square C /32bits: 82c (unrolled)/102c C /64bits: 69c (unrolled)/82c SSE/32bits: 42c SSE/64bits: 31c Use of SSE4.1 dpps to perform the final sum is slower. Not unrolling to perform 8 operations in a loop yields 10 more cycles. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`fld dword r0m`
			`%endif`
			`RET`
SBR DSP x86: implement SSE sbr_hf_g_filt Unrolling the main loop to process, instead of 4 elements: - 8: minor gain of 2 cycles (not worth the extra object size) - 2: loss of 8 cycles. Assigning STEP to a register is a loss. Output address (Y) is almost always unaligned. Timings: - C (32/64 bits): 117/109 cycles - SSE: 57 cycles Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago
			`%define STEP 40*4*2`
			`cglobal sbr_hf_g_filt, 5, 6, 5`
			`lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high`
			`mov r5, r3`
			`and r3, 0xFC`
			`lea r2, [r2 + r3*4]`
			`lea r0, [r0 + r3*8]`
			`neg r3`
aacsbr: handle m_max values smaller than 4. Prevents a signflip in the counter, and a subsequent crash because of overreads/overwrites. Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind CC: libav-stable@libav.org 13 years ago			`jz .loop1`
SBR DSP x86: implement SSE sbr_hf_g_filt Unrolling the main loop to process, instead of 4 elements: - 8: minor gain of 2 cycles (not worth the extra object size) - 2: loss of 8 cycles. Assigning STEP to a register is a loss. Output address (Y) is almost always unaligned. Timings: - C (32/64 bits): 117/109 cycles - SSE: 57 cycles Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`.loop4:`
sbrdsp.asm: convert all instructions to float/SSE ones. Since the values are floats, using the float operations makes sense, improves performance on some CPUs and makes the code SSE compatible instead of needing SSE2. Based on suggestion by Jason. Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`movlps m0, [r2 + 4*r3 + 0]`
			`movlps m1, [r2 + 4*r3 + 8]`
			`movlps m2, [r1 + 0*STEP]`
			`movlps m3, [r1 + 2*STEP]`
SBR DSP x86: implement SSE sbr_hf_g_filt Unrolling the main loop to process, instead of 4 elements: - 8: minor gain of 2 cycles (not worth the extra object size) - 2: loss of 8 cycles. Assigning STEP to a register is a loss. Output address (Y) is almost always unaligned. Timings: - C (32/64 bits): 117/109 cycles - SSE: 57 cycles Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`movhps m2, [r1 + 1*STEP]`
			`movhps m3, [r1 + 3*STEP]`
sbrdsp.asm: convert all instructions to float/SSE ones. Since the values are floats, using the float operations makes sense, improves performance on some CPUs and makes the code SSE compatible instead of needing SSE2. Based on suggestion by Jason. Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`unpcklps m0, m0`
			`unpcklps m1, m1`
SBR DSP x86: implement SSE sbr_hf_g_filt Unrolling the main loop to process, instead of 4 elements: - 8: minor gain of 2 cycles (not worth the extra object size) - 2: loss of 8 cycles. Assigning STEP to a register is a loss. Output address (Y) is almost always unaligned. Timings: - C (32/64 bits): 117/109 cycles - SSE: 57 cycles Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`mulps m0, m2`
			`mulps m1, m3`
			`movu [r0 + 8*r3 + 0], m0`
			`movu [r0 + 8*r3 + 16], m1`
			`add r1, 4*STEP`
			`add r3, 4`
			`jnz .loop4`
			`and r5, 3 ; number of single element loops`
			`jz .end`
			`.loop1: ; element 0 and 1 can be computed at the same time`
			`movss m0, [r2]`
sbrdsp.asm: convert all instructions to float/SSE ones. Since the values are floats, using the float operations makes sense, improves performance on some CPUs and makes the code SSE compatible instead of needing SSE2. Based on suggestion by Jason. Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`movlps m2, [r1]`
			`unpcklps m0, m0`
SBR DSP x86: implement SSE sbr_hf_g_filt Unrolling the main loop to process, instead of 4 elements: - 8: minor gain of 2 cycles (not worth the extra object size) - 2: loss of 8 cycles. Assigning STEP to a register is a loss. Output address (Y) is almost always unaligned. Timings: - C (32/64 bits): 117/109 cycles - SSE: 57 cycles Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`mulps m2, m0`
SBR DSP: fix SSE code to not use SSE2 instructions. movq from SSE register _to_ memory is an SSE2 instruction. Use the SSE movlps function instead that does the same thing. Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`movlps [r0], m2`
SBR DSP x86: implement SSE sbr_hf_g_filt Unrolling the main loop to process, instead of 4 elements: - 8: minor gain of 2 cycles (not worth the extra object size) - 2: loss of 8 cycles. Assigning STEP to a register is a loss. Output address (Y) is almost always unaligned. Timings: - C (32/64 bits): 117/109 cycles - SSE: 57 cycles Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com> 13 years ago			`add r0, 8`
			`add r2, 4`
			`add r1, STEP`
			`dec r5`
			`jnz .loop1`
			`.end:`
			`RET`
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago
			`; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],`
			`; const float alpha0[2], const float alpha1[2],`
			`; float bw, int start, int end)`
			`;`
			`cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E`
			`; load alpha factors`
			`%define bw m0`
			`%if ARCH_X86_64 == 0 || WIN64`
			`movss bw, BWm`
			`%endif`
			`movlps m2, [alpha1q]`
			`movlps m1, [alpha0q]`
			`shufps bw, bw, 0`
			`mulps m2, bw ; (a1[0] a1[1])*bw`
			`mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)`
			`mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)`
			`mova m3, m1`
			`mova m4, m2`

			`; Set pointers`
			`%if ARCH_X86_64 == 0 || WIN64`
			`; start and end 6th and 7th args on stack`
			`mov r2d, Sm`
			`mov r3d, Em`
			`%define start r2q`
			`%define end r3q`
			`%else`
			`; BW does not actually occupy a register, so shift by 1`
			`%define start BWq`
			`%define end Sq`
			`%endif`
			`sub start, end ; neg num of loops`
			`lea X_highq, [X_highq + end*2*4]`
			`lea X_lowq, [X_lowq + end*2*4 - 2*2*4]`
			`shl start, 3 ; offset from num loops`

			`mova m0, [X_lowq + start]`
sbr_hf_gen_sse: Optimize code a bit more. Core I7 (Sandy Bridge) 135 to 107 cycles Core i5 (Arrandale) 162 to 142 (Thanks to Christophe Gisquet for testing) Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 12 years ago			`shufps m3, m3, q1111`
			`shufps m4, m4, q1111`
			`xorps m3, [ps_mask]`
			`shufps m1, m1, q0000`
			`shufps m2, m2, q0000`
			`xorps m4, [ps_mask]`
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago			`.loop2:`
sbr_hf_gen_sse: Optimize code a bit more. Core I7 (Sandy Bridge) 135 to 107 cycles Core i5 (Arrandale) 162 to 142 (Thanks to Christophe Gisquet for testing) Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 12 years ago			`movu m7, [X_lowq + start + 8] ; BbCc`
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago			`mova m6, m0`
sbr_hf_gen_sse: Optimize code a bit more. Core I7 (Sandy Bridge) 135 to 107 cycles Core i5 (Arrandale) 162 to 142 (Thanks to Christophe Gisquet for testing) Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 12 years ago			`mova m5, m7`
			`shufps m0, m0, q2301 ; aAbB`
			`shufps m7, m7, q2301 ; bBcC`
			`mulps m0, m4`
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago			`mulps m7, m3`
sbr_hf_gen_sse: Optimize code a bit more. Core I7 (Sandy Bridge) 135 to 107 cycles Core i5 (Arrandale) 162 to 142 (Thanks to Christophe Gisquet for testing) Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 12 years ago			`mulps m6, m2`
			`mulps m5, m1`
			`addps m7, m0`
			`mova m0, [X_lowq + start +16] ; CcDd`
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago			`addps m7, m0`
sbr_hf_gen_sse: Optimize code a bit more. Core I7 (Sandy Bridge) 135 to 107 cycles Core i5 (Arrandale) 162 to 142 (Thanks to Christophe Gisquet for testing) Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 12 years ago			`addps m6, m5`
			`addps m7, m6`
			`mova [X_highq + start], m7`
SBR DSP x86: implement SSE sbr_hf_gen Start and end index are multiple of 2, therefore guaranteeing aligned access. Also, this allows to generate 4 floats per loop, keeping the alignment all along. Timing: - 32 bits: 326c -> 172c - 64 bits: 323c -> 156c Signed-off-by: Diego Biurrun <diego@biurrun.de> 13 years ago			`add start, 16`
			`jnz .loop2`
			`RET`
x86: sbrdsp: Implement SSE sum64x5 698 to 174 cycles on Arrandale. Unrolling is a 6 cycles gain. Signed-off-by: Diego Biurrun <diego@biurrun.de> 12 years ago
			`cglobal sbr_sum64x5, 1,2,4,z`
			`lea r1q, [zq+ 256]`
			`.loop:`
			`mova m0, [zq+ 0]`
			`mova m2, [zq+ 16]`
			`mova m1, [zq+ 256]`
			`mova m3, [zq+ 272]`
			`addps m0, [zq+ 512]`
			`addps m2, [zq+ 528]`
			`addps m1, [zq+ 768]`
			`addps m3, [zq+ 784]`
			`addps m0, [zq+1024]`
			`addps m2, [zq+1040]`
			`addps m0, m1`
			`addps m2, m3`
			`mova [zq], m0`
			`mova [zq+16], m2`
			`add zq, 32`
			`cmp zq, r1q`
			`jne .loop`
			`REP_RET`
x86: sbrdsp: Implement SSE qmf_post_shuffle 255 to 174 cycles on Arrandale / Win64. Unrolling yields no gain. Signed-off-by: Diego Biurrun <diego@biurrun.de> 12 years ago
			`INIT_XMM sse`
			`cglobal sbr_qmf_post_shuffle, 2,3,4,W,z`
			`lea r2q, [zq + (64-4)*4]`
			`mova m3, [ps_neg]`
			`.loop:`
			`mova m1, [zq]`
			`xorps m0, m3, [r2q]`
			`shufps m0, m0, m0, q0123`
			`unpcklps m2, m0, m1`
			`unpckhps m0, m0, m1`
			`mova [Wq + 0], m2`
			`mova [Wq + 16], m0`
			`add Wq, 32`
			`sub r2q, 16`
			`add zq, 16`
			`cmp zq, r2q`
			`jl .loop`
			`REP_RET`