FFmpeg/libavcodec/x86/mlpdsp.asm

;******************************************************************************
;* SIMD-optimized MLP DSP functions
;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%if ARCH_X86_64

; SHLX reg, cnt
; Shift the register %1 left, in place, by the count held in the register
; whose size-less name is %2.
;   * Without BMI2 the legacy variable shift is used; it takes the byte-sized
;     name of the count register (%2b), which the caller must have aliased
;     onto rcx so that this resolves to cl.
;   * With BMI2 the three-operand shlx form is used; it accepts the count in
;     any general purpose register (%2q).
%macro SHLX 2
%if notcpuflag(bmi2)
   shl  %1, %2b
%else
   shlx %1, %1, %2q
%endif
%endmacro

; REMATRIX
; Multiply the 8 int32 values at [samplesq] element-wise with the 8 int32
; values at [coeffsq] and reduce the signed 64-bit products down to two
; partial sums, one in each qword lane of xm0. Folding those two lanes into a
; single scalar is left to LOOP_END / LOOP_SHIFT_END.
;
; pmuldq only multiplies the low (even) dword of every qword lane, so each
; input is also shuffled with q2301 (swap the two dwords of every qword) to
; bring the odd elements into the even positions for a second multiply.
;
; Both pointers are read with movdqa and therefore have to be aligned to the
; vector size in use.
; Clobbers: m0-m4 in the non-AVX2 build, m0-m3 in the AVX2 build.
%macro REMATRIX 0
    movdqa        m0, [samplesq]                    ; first vector of samples
    movdqa        m1, [coeffsq ]                    ; matching coefficients
    pshufd        m2, m0, q2301                     ; odd samples -> even dword slots
    pshufd        m3, m1, q2301                     ; odd coeffs  -> even dword slots
    pmuldq        m0, m1                            ; 64-bit products of the even elements
    pmuldq        m3, m2                            ; 64-bit products of the odd elements
    paddq         m0, m3                            ; per-qword partial sums
%if notcpuflag(avx2)
    ; 128-bit registers: the first load only covered elements 0-3, so repeat
    ; the same sequence for elements 4-7 and accumulate into m0.
    movdqa        m1, [samplesq + 16]
    movdqa        m2, [coeffsq  + 16]
    pshufd        m3, m1, q2301
    pshufd        m4, m2, q2301
    pmuldq        m1, m2
    pmuldq        m4, m3
    paddq         m0, m1
    paddq         m0, m4
%else
    ; 256-bit registers: all 8 elements were handled above, fold the upper
    ; 128-bit half of m0 into the lower half.
    vextracti128 xm1, m0, 1
    paddq        xm0, xm1
%endif
%endmacro

; LOOP_END
; Tail of one loop iteration for the path without noise (matrix_noise_shift
; == 0). Expects the two qword partial sums of the current row in xm0.
;   1. folds the two qword lanes of xm0 into a scalar 64-bit accumulator,
;   2. stores ((accum >> 14) & mask) + *bypassed_lsbs as a dword at
;      [samplesq + dest_chq] (dest_ch is already a byte offset here),
;   3. advances bypassed_lsbs by 8 bytes and samples by 32 bytes (one row),
;   4. compares the advanced bypassed_lsbs pointer against cnt.
; The flags of that final cmp are the macro's result: the caller follows it
; with jne to loop until the end pointer is reached.
; Clobbers: xm0, xm1, accum, blsbs, flags.
%macro LOOP_END 0
    pshufd       xm1, xm0, q0032                    ; bring the high qword of xm0 down into the low qword of xm1
    paddq        xm0, xm1                           ; low qword = sum of both partial sums
    movq      accumq, xm0
    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
    sar       accumq, 14                            ; accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq                          ; reached the end pointer? (flags consumed by the caller's jne)
%endmacro

; LOOP_SHIFT_END
; Tail of one loop iteration for the path with noise (matrix_noise_shift !=
; 0). Same as LOOP_END, except that a shifted noise sample is added to the
; accumulator before it is scaled down:
;   index &= ausp;                 (ausp was decremented by the caller, so it
;                                   acts as a wrap-around mask)
;   accum += (int64_t)noise_buffer[index] << mns;
;                                  (the caller set mns = matrix_noise_shift + 7)
;   index += index2;
; followed by the store to samples[dest_ch], the pointer increments and the
; final cmp whose flags the caller consumes with jne.
; In non-BMI2 builds mns has to be aliased onto rcx, because SHLX then
; shifts by the byte register of mns.
; Clobbers: xm0, xm1, accum, noise, index, flags.
%macro LOOP_SHIFT_END 0
    pshufd       xm1, xm0, q0032                    ; bring the high qword of xm0 down into the low qword of xm1
    paddq        xm0, xm1                           ; low qword = sum of both partial sums
    movq      accumq, xm0
    and       indexd, auspd                         ; index &= access_unit_size_pow2 - 1 (ausp is pre-decremented)
    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index], sign extended to 64 bits
    add       indexd, index2d                       ; index += index2
    SHLX      noiseq, mns                           ; noise <<= mns (= matrix_noise_shift + 7)
    add       accumq, noiseq                        ; accum += shifted noise
    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
    sar       accumq, 14                            ; accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, noised                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq                          ; reached the end pointer? (flags consumed by the caller's jne)
%endmacro

;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                             int index, unsigned int dest_ch, uint16_t blockpos,
;                             unsigned int maxchan, int matrix_noise_shift,
;                             int access_unit_size_pow2, int32_t mask)
;
; For every row (8 int32 samples / 8 bypassed_lsbs bytes per row), for
; blockpos rows in total:
;     accum = sum(samples[i] * coeffs[i])              (64-bit products)
;     if (matrix_noise_shift) {
;         index &= access_unit_size_pow2 - 1;
;         accum += noise_buffer[index] << (matrix_noise_shift + 7);
;         index += 2 * initial_index + 1;
;     }
;     samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
;
; Four loops are generated: with/without noise, each in a variant that
; multiplies all 8 lanes (maxchan >= 4) or only the first 4 (maxchan < 4).
; NOTE(review): lanes above maxchan are still multiplied, so the caller
; presumably zero-pads coeffs - confirm.
; NOTE(review): all loops are bottom-tested and terminate on pointer
; equality, so at least one row is always processed; blockpos == 0 is
; presumably never passed - confirm against the caller.
;
; Register usage: cglobal is asked to auto-load 0 arguments and to provide 13
; GPRs and 5 vector registers. Arguments that the ABI passes in registers are
; used in place; everything else is loaded manually from its <name>m / rNm
; location. On the noise path the registers are renamed with DEFINE_ARGS, so
; some instructions deliberately use the raw rN names:
;     r6 = blockpos (dead once cnt is computed), reused afterwards
;     r7 = maxchan, later index2
;     r8 = matrix_noise_shift as loaded on entry
;     r9 = access_unit_size_pow2 (named accum on the no-noise path)
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    mov         mnsd, mnsm                          ; load matrix_noise_shift
    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
    mov     maxchand, maxchanm                      ; load maxchan
    mov        maskd, maskm                         ; load mask
%if WIN64
    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
%endif
    shl     dest_chd, 2                             ; dest_ch *= sizeof(int32_t): byte offset into a row (32-bit write also zero extends dest_chq)
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = end pointer = bypassed_lsbs + blockpos * MAX_CHANNELS
    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
    jne .shift                                      ; jump if true
    cmp     maxchand, 4                             ; is maxchan < 4?
    jl .loop4                                       ; jump if true

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8                                      ; loop until bypassed_lsbs == cnt
    RET

align 16
.loop4:
    ; Process up to 4 channels
    ; Same multiply/accumulate as the first half of REMATRIX, but always on
    ; 128-bit registers since only elements 0-3 are needed.
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_END
    jne .loop4                                      ; loop until bypassed_lsbs == cnt
    RET

.shift:
    ; Noise path: load the remaining arguments and re-alias the registers.
%if WIN64
    mov       indexd, indexm         ; load index (not needed on UNIX64)
%endif
    mov          r9d, r9m            ; load access_unit_size_pow2
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7              ; matrix_noise_shift += 7
%else ; sse4
    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx
    ; samples lived in rcx and now lives in r6; mns takes over rcx.
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx
    ; noise_buffer lived in rcx and now lives in r6; mns takes over rcx.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
    sub        auspd, 1              ; access_unit_size_pow2 -= 1
    cmp          r7d, 4              ; is maxchan < 4?
    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; (overwrites maxchan in r7, lea leaves the cmp flags intact)
    jl .loop4_shift                  ; jump if maxchan < 4

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift                 ; loop until bypassed_lsbs == cnt
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels
    ; Same multiply/accumulate as the first half of REMATRIX, but always on
    ; 128-bit registers since only elements 0-3 are needed.
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_SHIFT_END
    jne .loop4_shift                 ; loop until bypassed_lsbs == cnt
    RET
%endmacro

; Instantiate the function once per instruction set. The cpuflags selected
; here drive the cpuflag()/notcpuflag() switches inside the macros above.

; SSE4 build: 128-bit registers, legacy shl with the count in cl.
INIT_XMM sse4
MLP_REMATRIX_CHANNEL
%if HAVE_AVX2_EXTERNAL
; AVX2 + BMI2 build: 256-bit registers for the 8-lane multiply, shlx for the
; noise shift. Only assembled when HAVE_AVX2_EXTERNAL is set.
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif

%endif ; ARCH_X86_64
x86/mlpdec: add ff_mlp_rematrix_channel_{sse4,avx2} 2x to 2.5x faster than the C version. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 11 years ago			`;******************************************************************************`
			`;* SIMD-optimized MLP DSP functions`
			`;* Copyright (c) 2014 James Almer <jamrial@gmail.com>`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

			`%include "libavutil/x86/x86util.asm"`

x86inc: Drop SECTION_TEXT macro The .text section is already 16-byte aligned by default on all supported platforms so `SECTION_TEXT` isn't any different from `SECTION .text`. 10 years ago			`SECTION .text`
x86/mlpdec: add ff_mlp_rematrix_channel_{sse4,avx2} 2x to 2.5x faster than the C version. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 11 years ago
			`%if ARCH_X86_64`

			`%macro SHLX 2`
			`%if cpuflag(bmi2)`
			`shlx %1, %1, %2q`
			`%else`
			`shl %1, %2b`
			`%endif`
			`%endmacro`

			`%macro REMATRIX 0`
			`movdqa m0, [samplesq]`
			`movdqa m1, [coeffsq ]`
			`pshufd m2, m0, q2301`
			`pshufd m3, m1, q2301`
			`pmuldq m0, m1`
			`pmuldq m3, m2`
			`paddq m0, m3`
			`%if notcpuflag(avx2)`
			`movdqa m1, [samplesq + 16]`
			`movdqa m2, [coeffsq + 16]`
			`pshufd m3, m1, q2301`
			`pshufd m4, m2, q2301`
			`pmuldq m1, m2`
			`pmuldq m4, m3`
			`paddq m0, m1`
			`paddq m0, m4`
			`%else`
			`vextracti128 xm1, m0, 1`
			`paddq xm0, xm1`
			`%endif`
			`%endmacro`

			`%macro LOOP_END 0`
			`pshufd xm1, xm0, q0032`
			`paddq xm0, xm1`
			`movq accumq, xm0`
			`movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs`
			`sar accumq, 14 ; accum >>= 14`
			`and accumd, maskd ; accum &= mask`
			`add accumd, blsbsd ; accum += *bypassed_lsbs`
			`mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum`
			`add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;`
			`add samplesq, 32 ; samples += MAX_CHANNELS;`
			`cmp blsbs_ptrq, cntq`
			`%endmacro`

			`%macro LOOP_SHIFT_END 0`
			`pshufd xm1, xm0, q0032`
			`paddq xm0, xm1`
			`movq accumq, xm0`
			`and indexd, auspd ; index &= access_unit_size_pow2;`
			`movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]`
			`add indexd, index2d ; index += index2`
			`SHLX noiseq, mns ; noise_buffer[index] <<= matrix_noise_shift`
			`add accumq, noiseq ; accum += noise_buffer[index]`
			`movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register)`
			`sar accumq, 14 ; accum >>= 14`
			`and accumd, maskd ; accum &= mask`
			`add accumd, noised ; accum += *bypassed_lsbs`
			`mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum`
			`add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;`
			`add samplesq, 32 ; samples += MAX_CHANNELS;`
			`cmp blsbs_ptrq, cntq`
			`%endmacro`

			`;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,`
			`; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,`
			`; int index, unsigned int dest_ch, uint16_t blockpos,`
			`; unsigned int maxchan, int matrix_noise_shift,`
			`; int access_unit_size_pow2, int32_t mask)`
			`%macro MLP_REMATRIX_CHANNEL 0`
			`cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \`
			`index, dest_ch, blockpos, maxchan, mns, \`
			`accum, mask, cnt`
			`mov mnsd, mnsm ; load matrix_noise_shift`
			`movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)`
			`mov maxchand, maxchanm ; load maxchan`
			`mov maskd, maskm ; load mask`
			`%if WIN64`
			`mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64)`
			`%endif`
			`shl dest_chd, 2`
			`lea cntq, [blsbs_ptrq + blockposq*8]`
			`test mnsd, mnsd ; is matrix_noise_shift != 0?`
			`jne .shift ; jump if true`
			`cmp maxchand, 4 ; is maxchan < 4?`
			`jl .loop4 ; jump if true`

			`align 16`
			`.loop8:`
			`; Process 5 or more channels`
			`REMATRIX`
			`LOOP_END`
			`jne .loop8`
			`RET`

			`align 16`
			`.loop4:`
			`; Process up to 4 channels`
			`movdqa xm0, [samplesq]`
			`movdqa xm1, [coeffsq ]`
			`pshufd xm2, xm0, q2301`
			`pshufd xm3, xm1, q2301`
			`pmuldq xm0, xm1`
			`pmuldq xm3, xm2`
			`paddq xm0, xm3`
			`LOOP_END`
			`jne .loop4`
			`RET`

			`.shift:`
			`%if WIN64`
			`mov indexd, indexm ; load index (not needed on UNIX64)`
			`%endif`
			`mov r9d, r9m ; load access_unit_size_pow2`
			`%if cpuflag(bmi2)`
			`; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.`
			`DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \`
			`index, dest_ch, accum, index2, mns, \`
			`ausp, mask, cnt, noise`
			`add mnsd, 7 ; matrix_noise_shift += 7`
			`%else ; sse4`
			`mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift`
			`%if WIN64`
			`; r0 = rcx`
			`DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \`
			`index2, accum, ausp, mask, cnt, noise`
			`%else ; UNIX64`
			`; r3 = rcx`
			`DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \`
			`index2, accum, ausp, mask, cnt, noise`
			`%endif`
			`lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7`
			`%endif ; cpuflag`
			`sub auspd, 1 ; access_unit_size_pow2 -= 1`
			`cmp r7d, 4 ; is maxchan < 4?`
			`lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;`
			`jl .loop4_shift ; jump if maxchan < 4`

			`align 16`
			`.loop8_shift:`
			`; Process 5 or more channels`
			`REMATRIX`
			`LOOP_SHIFT_END`
			`jne .loop8_shift`
			`RET`

			`align 16`
			`.loop4_shift:`
			`; Process up to 4 channels`
			`movdqa xm0, [samplesq]`
			`movdqa xm1, [coeffsq ]`
			`pshufd xm2, xm0, q2301`
			`pshufd xm3, xm1, q2301`
			`pmuldq xm0, xm1`
			`pmuldq xm3, xm2`
			`paddq xm0, xm3`
			`LOOP_SHIFT_END`
			`jne .loop4_shift`
			`RET`
			`%endmacro`

			`INIT_XMM sse4`
			`MLP_REMATRIX_CHANNEL`
			`%if HAVE_AVX2_EXTERNAL`
			`INIT_YMM avx2, bmi2`
			`MLP_REMATRIX_CHANNEL`
			`%endif`

			`%endif ; ARCH_X86_64`