FFmpeg/libavcodec/x86/huffyuvdsp.asm

;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%include "libavcodec/x86/huffyuvdsp_template.asm"

;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;
; ADD_INT16 expands to one add_int16 entry point for whichever instruction set
; the preceding INIT_XMM / INIT_YMM line selected. The entry point only picks
; between two expansions of INT16_LOOP: the "a" variant when both src and dst
; have their low bits clear with respect to mmsize, the "u" variant otherwise.
; INT16_LOOP is not defined in this file (presumably it comes from
; huffyuvdsp_template.asm, included above), so the arithmetic itself is not
; visible here.
;
; NOTE(review): nothing in this macro separates the first expansion from
; .unaligned, so INT16_LOOP is presumably expected to end with its own return;
; confirm in the template.
; NOTE(review): a fifth name, tmp, is declared although only 4 general-purpose
; registers are requested; presumably INT16_LOOP saves and restores that
; register itself - confirm in the template.
;------------------------------------------------------------------------------

%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    ; Any low address bit set in src (relative to the vector size) rules out
    ; the aligned variant.
    test srcq, mmsize-1
    jnz .unaligned
    ; Same check for dst; both pointers must pass to stay on the aligned path.
    test dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add
%endmacro

; SSE2 build of add_int16.
INIT_XMM sse2
ADD_INT16

; AVX2 build of add_int16, assembled only when the build configuration
; defines HAVE_AVX2_EXTERNAL as non-zero.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
;
; Left prediction on 4-byte elements: every output element is the byte-wise
; (wrapping) sum of all source elements up to and including its position plus
; the 4 bytes read from left. On return, left holds the 4 bytes located just
; before dst + 4*w.
;
; The loop body always runs at least once and always handles a full vector
; (mmsize bytes) of src and dst.
; NOTE(review): when 4*w is not a multiple of mmsize, or w is 0, the last
; iteration therefore reads and writes past the requested width; callers
; presumably provide padded buffers - confirm.
INIT_XMM sse2
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    ; Convert the element count into a byte count (4 bytes per element).
    shl           wq, 2
    ; m0 = the 4 bytes of the running "left" element, in the low dword.
    movd          m0, [leftq]
    ; Point dst and src at the end of the row; wq becomes a negative offset
    ; that counts up towards zero.
    lea         dstq, [dstq + wq]
    lea         srcq, [srcq + wq]
    ; Move "left" up by mmsize-4 bytes so the pshufd in the loop finds it in
    ; the top dword (LSHIFT is defined elsewhere; presumably a whole-register
    ; byte shift - confirm in x86util).
    LSHIFT        m0, mmsize-4
    neg           wq
.loop:
    ; m1 = one vector of source elements; unaligned load, no alignment needed.
    movu          m1, [srcq+wq]
    ; First prefix-sum step: add each element's predecessor (distance 1).
    mova          m2, m1
    LSHIFT        m1, 4
    paddb         m1, m2
    ; Broadcast the top dword of m0 (initial left, or the last element written
    ; by the previous iteration) to every dword.
    pshufd        m0, m0, q3333
    ; Second prefix-sum step: add the partial sum two elements back, which
    ; completes the running sum within the vector.
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2
    ; Add the carried-in left value to every element and store the result.
    paddb         m0, m1
    movu   [dstq+wq], m0
    add           wq, mmsize
    ; Continue while the offset is still negative (signed compare).
    jl         .loop
    ; Hand the element just before the end of the row back through left.
    movd          m0, [dstq-4]
    movd     [leftq], m0
    RET


; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top,
;                                 const uint16_t *diff, int mask, int w,
;                                 int *left, int *left_top)
; NOTE(review): the 16-bit element type is inferred from the word-sized
; arithmetic and accesses below; the C declaration is outside this file, so
; confirm the exact parameter types there.
;
; Median prediction on 16-bit samples, 4 samples per loop iteration:
;     pred   = median(l, t, (l + t - tl) & mask)
;     dst[i] = (pred + diff[i]) & mask
; where l is the previous output sample (initially the low word of *left),
; t is top[i] and tl is top[i-1] (initially *left_top). The median uses signed
; word min/max. On return *left receives the last dst word before the end of
; the row and *left_top the last top word, both zero-extended.
;
; The loop body always runs at least once and always handles 4 samples.
; NOTE(review): a w that is not a positive multiple of 4 makes the last
; iteration touch memory past the requested width; callers presumably pad.
; NOTE(review): *left_top is merged in with a 32-bit load and an OR, so bits
; 16-31 of that value must be zero for the second lane to stay correct.
; NOTE(review): no emms is issued in this function; presumably the caller is
; responsible for clearing the MMX state - confirm.
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    ; Convert the sample count into a byte count (2 bytes per sample).
    add      wd, wd
    ; mm6 = mask in every word lane (SPLATW is defined elsewhere; presumably it
    ; replicates the low word - confirm in x86util).
    movd    mm6, maskd
    SPLATW  mm6, mm6
    ; First group of 4 top samples: mm0 and mm1 both start as t.
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    ; Build tl: the top samples moved up one lane with *left_top in lane 0.
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    ; mm3 = running left sample; only the low word is meaningful.
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    ; Point all three rows at their end; wq becomes a negative offset that
    ; counts up towards zero.
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    ; The first group's t and t-tl are already set up, so skip the reload.
    jmp .skip
.loop:
    ; Next group of 4 top samples. mm1 was shifted right three times in the
    ; previous pass, so its low word is the last top sample of the previous
    ; group, which is exactly the tl of the first sample in this group.
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    ; mm2 = 4 residuals for this group.
    movq    mm2, [diffq+wq]
    ; Each repetition resolves one sample in the low word lane; the samples
    ; are processed one after another because each prediction needs the
    ; previous output as l.
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    ; median(l, t, grad) = max(min(max(l, t), grad), min(l, t))
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
    ; Collect the results in mm7: each new sample enters at the top word and
    ; earlier ones slide down, so after 4 repetitions they are in memory order.
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
    ; Bring the next sample's t-tl, t and residual down into the low word
    ; (not needed after the last sample of the group).
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    ; Store the 4 reconstructed samples and advance by 8 bytes.
    movq [dstq+wq], mm7
    add      wq, 8
    ; Continue while the offset is still negative (signed compare).
    jl .loop
    ; r2 held diff, which is no longer needed, so it serves as scratch here.
    ; Write back the last output sample and the last top sample.
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d
    RET
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 11 years ago			`;******************************************************************************`
			`;* SIMD-optimized HuffYUV functions`
			`;* Copyright (c) 2008 Loren Merritt`
x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* Copyright (c) 2014 Christophe Gisquet`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 11 years ago			`;*`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* This file is part of FFmpeg.`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 11 years ago			`;*`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* FFmpeg is free software; you can redistribute it and/or`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 11 years ago			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* FFmpeg is distributed in the hope that it will be useful,`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 11 years ago			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
Merge commit '0d439fbede03854eac8a978cccf21a3425a3c82d' * commit '0d439fbede03854eac8a978cccf21a3425a3c82d': dsputil: Split off HuffYUV decoding bits into their own context Conflicts: configure libavcodec/dsputil.c libavcodec/dsputil.h libavcodec/huffyuv.h libavcodec/huffyuvdec.c libavcodec/lagarith.c libavcodec/vble.c libavcodec/x86/Makefile libavcodec/x86/dsputil.asm libavcodec/x86/dsputil_init.c libavcodec/x86/dsputil_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`;* License along with FFmpeg; if not, write to the Free Software`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 11 years ago			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

			`%include "libavutil/x86/x86util.asm"`

x86inc: Drop SECTION_TEXT macro The .text section is already 16-byte aligned by default on all supported platforms so `SECTION_TEXT` isn't any different from `SECTION .text`. 10 years ago			`SECTION .text`
dsputil: Split off HuffYUV decoding bits into their own context Also shorten HuffYUV context member names to avoid clutter. 11 years ago
avcodec/huffyuvdsp(enc) : move duplicate macro to a template file 7 years ago			`%include "libavcodec/x86/huffyuvdsp_template.asm"`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 8 years ago
avcodec/huffyuvdsp : reorganize add_int16 asm 7 years ago			`;------------------------------------------------------------------------------`
			`; void (add_int16)(uint16_t dst, const uint16_t *src, unsigned mask, int w);`
			`;------------------------------------------------------------------------------`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 8 years ago
avcodec/huffyuvdsp : reorganize add_int16 asm 7 years ago			`%macro ADD_INT16 0`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 8 years ago			`cglobal add_int16, 4,4,5, dst, src, mask, w, tmp`
			`test srcq, mmsize-1`
			`jnz .unaligned`
			`test dstq, mmsize-1`
			`jnz .unaligned`
			`INT16_LOOP a, add`
			`.unaligned:`
			`INT16_LOOP u, add`
avcodec/huffyuvdsp : reorganize add_int16 asm 7 years ago			`%endmacro`

			`INIT_XMM sse2`
			`ADD_INT16`
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 8 years ago
avcodec/huffyuvdsp : add add_int16 AVX2 func 7 years ago			`%if HAVE_AVX2_EXTERNAL`
			`INIT_YMM avx2`
			`ADD_INT16`
			`%endif`

x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`; void add_hfyu_left_pred_bgr32(uint8_t dst, const uint8_t src,`
			`; intptr_t w, uint8_t *left)`
avcodec/x86/huffyuvdsp: Remove obsolete MMX functions The only systems which benefit from these are truely ancient 32bit x86s as all other systems use at least the SSE2 versions (this includes all x64 cpus (which is why this code is restricted to x86-32)). Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> 3 years ago			`INIT_XMM sse2`
x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago			`cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left`
			`shl wq, 2`
			`movd m0, [leftq]`
			`lea dstq, [dstq + wq]`
			`lea srcq, [srcq + wq]`
			`LSHIFT m0, mmsize-4`
			`neg wq`
			`.loop:`
			`movu m1, [srcq+wq]`
			`mova m2, m1`
			`LSHIFT m1, 4`
			`paddb m1, m2`
			`pshufd m0, m0, q3333`
			`mova m2, m1`
			`LSHIFT m1, 8`
			`paddb m1, m2`
			`paddb m0, m1`
			`movu [dstq+wq], m0`
			`add wq, mmsize`
			`jl .loop`
			`movd m0, [dstq-4]`
			`movd [leftq], m0`
x86: replace explicit REP_RETs with RETs From x86inc: > On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either > a branch or a branch target. So switch to a 2-byte form of ret in that case. > We can automatically detect "follows a branch", but not a branch target. > (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) x86inc can automatically determine whether to use REP_RET rather than REP in most of these cases, so impact is minimal. Additionally, a few REP_RETs were used unnecessary, despite the return being nowhere near a branch. The only CPUs affected were AMD K10s, made between 2007 and 2011, 16 years ago and 12 years ago, respectively. In the future, everyone involved with x86inc should consider dropping REP_RETs altogether. 2 years ago			`RET`
x86: huffyuvdsp: add_hfyu_left_pred_bgr32 C MMX SSE2 Cycles: 3092 1053 578 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 11 years ago
huffyuvdsp: move functions only used by huffyuv from lossless_videodsp Signed-off-by: James Almer <jamrial@gmail.com> 8 years ago
			`; void add_hfyu_median_prediction_mmxext(uint8_t dst, const uint8_t top, const uint8_t diff, int mask, int w, int left, int *left_top)`
			`INIT_MMX mmxext`
			`cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top`
			`add wd, wd`
			`movd mm6, maskd`
			`SPLATW mm6, mm6`
			`movq mm0, [topq]`
			`movq mm2, mm0`
			`movd mm4, [left_topq]`
			`psllq mm2, 16`
			`movq mm1, mm0`
			`por mm4, mm2`
			`movd mm3, [leftq]`
			`psubw mm0, mm4 ; t-tl`
			`add dstq, wq`
			`add topq, wq`
			`add diffq, wq`
			`neg wq`
			`jmp .skip`
			`.loop:`
			`movq mm4, [topq+wq]`
			`movq mm0, mm4`
			`psllq mm4, 16`
			`por mm4, mm1`
			`movq mm1, mm0 ; t`
			`psubw mm0, mm4 ; t-tl`
			`.skip:`
			`movq mm2, [diffq+wq]`
			`%assign i 0`
			`%rep 4`
			`movq mm4, mm0`
			`paddw mm4, mm3 ; t-tl+l`
			`pand mm4, mm6`
			`movq mm5, mm3`
			`pmaxsw mm3, mm1`
			`pminsw mm5, mm1`
			`pminsw mm3, mm4`
			`pmaxsw mm3, mm5 ; median`
			`paddw mm3, mm2 ; +residual`
			`pand mm3, mm6`
			`%if i==0`
			`movq mm7, mm3`
			`psllq mm7, 48`
			`%else`
			`movq mm4, mm3`
			`psrlq mm7, 16`
			`psllq mm4, 48`
			`por mm7, mm4`
			`%endif`
			`%if i<3`
			`psrlq mm0, 16`
			`psrlq mm1, 16`
			`psrlq mm2, 16`
			`%endif`
			`%assign i i+1`
			`%endrep`
			`movq [dstq+wq], mm7`
			`add wq, 8`
			`jl .loop`
			`movzx r2d, word [dstq-2]`
			`mov [leftq], r2d`
			`movzx r2d, word [topq-2]`
			`mov [left_topq], r2d`
			`RET`