help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunkoldabi
parent
3a0885146c
commit
89fa3504ed
7 changed files with 178 additions and 270 deletions
@ -0,0 +1,170 @@ |
||||
;****************************************************************************** |
||||
;* MMX/SSE2-optimized functions for the VP6 decoder |
||||
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> |
||||
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "x86inc.asm" |
||||
%include "x86util.asm" |
||||
|
||||
cextern pw_64 |
||||
|
||||
SECTION .text |
||||
|
||||
;------------------------------------------------------------------------------
; DIAG4_MMX src, off0, off1, off2, off3, dst
;
; One 4-tap filter pass over 8 pixels using 64-bit MMX registers.
;   %1      base pointer of the source pixels
;   %2..%5  byte offsets of the four taps relative to %1
;   %6      pointer that receives the 8 filtered bytes
;
; Expects: m7 = 0 (used to zero-extend bytes to words),
;          m6 = rounding constant (the caller loads it from pw_64),
;          [rsp+8*11] .. [rsp+8*14] = tap weights 0..3, each replicated into
;          four words (stored there by SPLAT4REGS_MMX).
; Clobbers: m0-m5.
;
; Each tap is loaded once and split into a low half (pixels 0-3) and a high
; half (pixels 4-7); m0-m2 carry the low halves, m3-m5 the high halves.
;------------------------------------------------------------------------------
%macro DIAG4_MMX 6
    movq          m0, [%1+%2]
    movq          m1, [%1+%3]
    movq          m3, m0
    movq          m4, m1
    punpcklbw     m0, m7                 ; tap 0, pixels 0-3 -> words
    punpcklbw     m1, m7                 ; tap 1, pixels 0-3 -> words
    punpckhbw     m3, m7                 ; tap 0, pixels 4-7 -> words
    punpckhbw     m4, m7                 ; tap 1, pixels 4-7 -> words
    pmullw        m0, [rsp+8*11]         ; src[x-8 ] * biweight [0]
    pmullw        m1, [rsp+8*12]         ; src[x   ] * biweight [1]
    pmullw        m3, [rsp+8*11]         ; src[x-8 ] * biweight [0]
    pmullw        m4, [rsp+8*12]         ; src[x   ] * biweight [1]
    paddw         m0, m1
    paddw         m3, m4
    movq          m1, [%1+%4]
    movq          m2, [%1+%5]
    movq          m4, m1
    movq          m5, m2
    punpcklbw     m1, m7                 ; tap 2, pixels 0-3 -> words
    punpcklbw     m2, m7                 ; tap 3, pixels 0-3 -> words
    punpckhbw     m4, m7                 ; tap 2, pixels 4-7 -> words (high half, not low)
    punpckhbw     m5, m7                 ; tap 3, pixels 4-7 -> words (high half, not low)
    pmullw        m1, [rsp+8*13]         ; src[x+8 ] * biweight [2]
    pmullw        m2, [rsp+8*14]         ; src[x+16] * biweight [3]
    pmullw        m4, [rsp+8*13]         ; src[x+8 ] * biweight [2]
    pmullw        m5, [rsp+8*14]         ; src[x+16] * biweight [3]
    paddw         m1, m2
    paddw         m4, m5
    paddsw        m0, m1                 ; combine both tap pairs with signed saturation
    paddsw        m3, m4
    paddsw        m0, m6                 ; Add 64
    paddsw        m3, m6                 ; Add 64
    psraw         m0, 7
    psraw         m3, 7
    packuswb      m0, m3                 ; clamp to 0..255 and merge low/high halves
    movq        [%6], m0
%endmacro
||||
|
||||
;------------------------------------------------------------------------------
; DIAG4_SSE2 src, off0, off1, off2, off3, dst
;
; One 4-tap filter pass over 8 pixels using XMM registers.
;   %1      base pointer of the source pixels
;   %2..%5  byte offsets of the four taps relative to %1
;   %6      pointer that receives the 8 filtered bytes
;
; Expects: m7 = 0, and the tap weights replicated across all words of
;          m4 (tap 0), m5 (tap 1), m6 (tap 2) and m3 (tap 3), as left by
;          SPLAT4REGS_SSE2.
; Clobbers: m0-m2.
;------------------------------------------------------------------------------
%macro DIAG4_SSE2 6
    ; taps 0 and 1
    movq          m0, [%1+%2]
    punpcklbw     m0, m7
    pmullw        m0, m4                 ; src[x-8 ] * biweight [0]
    movq          m1, [%1+%3]
    punpcklbw     m1, m7
    pmullw        m1, m5                 ; src[x   ] * biweight [1]
    paddw         m0, m1

    ; taps 2 and 3
    movq          m1, [%1+%4]
    punpcklbw     m1, m7
    pmullw        m1, m6                 ; src[x+8 ] * biweight [2]
    movq          m2, [%1+%5]
    punpcklbw     m2, m7
    pmullw        m2, m3                 ; src[x+16] * biweight [3]
    paddw         m1, m2

    ; combine, round, scale and store
    paddsw        m0, m1
    paddsw        m0, [pw_64]            ; Add 64
    psraw         m0, 7
    packuswb      m0, m0                 ; clamp to 0..255; low 8 bytes hold the result
    movq        [%6], m0
%endmacro
||||
|
||||
;------------------------------------------------------------------------------
; SPLAT4REGS_MMX
;
; Broadcast the four 16-bit weights held in m3 into four 8-byte stack slots,
; one weight repeated four times per slot:
;   [rsp+8*11] = weight 0, [rsp+8*12] = weight 1,
;   [rsp+8*13] = weight 2, [rsp+8*14] = weight 3
;
; In:       m3 = { w0, w1, w2, w3 }
; Clobbers: m2, m3, m4, m5
;
; m2 is used as the scratch register for weight 3. m6 must not be touched
; here: vp6_filter_diag4 loads the rounding constant (pw_64) into m6 before
; invoking this macro and DIAG4_MMX relies on it still being there.
;------------------------------------------------------------------------------
%macro SPLAT4REGS_MMX 0
    movq          m5, m3
    punpcklwd     m3, m3                 ; m3 = w0 w0 w1 w1
    movq          m4, m3
    punpckldq     m3, m3                 ; m3 = w0 x4
    punpckhdq     m4, m4                 ; m4 = w1 x4
    punpckhwd     m5, m5                 ; m5 = w2 w2 w3 w3
    movq          m2, m5
    punpckhdq     m2, m2                 ; m2 = w3 x4
    punpckldq     m5, m5                 ; m5 = w2 x4
    movq  [rsp+8*11], m3
    movq  [rsp+8*12], m4
    movq  [rsp+8*13], m5
    movq  [rsp+8*14], m2
%endmacro
||||
|
||||
;------------------------------------------------------------------------------
; SPLAT4REGS_SSE2
;
; Broadcast the four 16-bit weights in the low quadword of m3 so that every
; word of a full register holds one weight:
;   m4 = weight 0, m5 = weight 1, m6 = weight 2, m3 = weight 3
;
; In:       m3 (low 64 bits) = { w0, w1, w2, w3 }
; Clobbers: m3, m4, m5, m6
;------------------------------------------------------------------------------
%macro SPLAT4REGS_SSE2 0
    pshuflw       m4, m3, 0x0            ; low quadword = w0 x4
    punpcklqdq    m4, m4                 ; copy low quadword into high quadword
    pshuflw       m5, m3, 0x55           ; w1
    punpcklqdq    m5, m5
    pshuflw       m6, m3, 0xAA           ; w2
    punpcklqdq    m6, m6
    pshuflw       m3, m3, 0xFF           ; w3; done last because it overwrites the source
    punpcklqdq    m3, m3
%endmacro
||||
|
||||
;------------------------------------------------------------------------------
; vp6_filter_diag4 <opt>, <vector register count>
;
; Emits the function
;   void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                  const int16_t h_weights[4],
;                                  const int16_t v_weights[4])
;
; Register roles: r0 = dst, r1 = src, r2 = stride, r3 = h_weights (later the
; temp-buffer cursor), r4 = v_weights, r5 = saved stack pointer, r6 = row
; counter.
;
; Stack layout after alignment:
;   [rsp      .. rsp+8*11)  11 intermediate rows of 8 bytes
;   [rsp+8*11 .. rsp+8*15)  MMX build only: splatted weights
;
; DIAG4 and SPLAT4REGS must be %defined to the matching MMX or SSE2 variants
; before this macro is invoked.
;
; NOTE(review): stride is declared as int but r2 is used at full register
; width in "sub r1, r2" / "add r1, r2" / "add r0, r2"; on 64-bit targets the
; upper half may need explicit sign extension -- confirm against the ABI and
; x86inc helpers.
;------------------------------------------------------------------------------
%macro vp6_filter_diag4 2
cglobal vp6_filter_diag4_%1, 5, 7, %2
    mov           r5, rsp                ; backup stack pointer
    and          rsp, ~(mmsize-1)        ; align stack
%ifidn %1, sse2
    sub          rsp, 8*11               ; temp rows only; weights stay in registers
%else
    sub          rsp, 8*15               ; temp rows + 4 splatted weight slots
    movq          m6, [pw_64]
%endif

    pxor          m7, m7                 ; zero register for byte -> word unpacking
    sub           r1, r2                 ; start one row above the block

    ; first pass: taps at src[-1..2], 11 rows into the temp buffer
    movq          m3, [r3]
    SPLAT4REGS

    mov           r3, rsp
    mov           r6, 11
.nextrow:
    DIAG4         r1, -1, 0, 1, 2, r3
    add           r3, 8
    add           r1, r2
    dec           r6
    jnz .nextrow

    ; second pass: taps at temp rows -1..2 (8 bytes per row), 8 rows into dst
    movq          m3, [r4]
    SPLAT4REGS

    lea           r3, [rsp+8]            ; second temp row is the first output row
    mov           r6, 8
.nextcol:
    DIAG4         r3, -8, 0, 8, 16, r0
    add           r3, 8
    add           r0, r2
    dec           r6
    jnz .nextcol

    mov          rsp, r5                 ; restore stack pointer
    RET
%endmacro
||||
|
||||
; MMX build: 64-bit registers, weights kept on the stack.
; The second macro argument is forwarded as the last cglobal argument
; (presumably the number of XMM registers used -- confirm in x86inc.asm).
INIT_MMX
%define DIAG4      DIAG4_MMX
%define SPLAT4REGS SPLAT4REGS_MMX
vp6_filter_diag4 mmx,  0

; SSE2 build: 128-bit registers, weights kept in m3-m6.
INIT_XMM
%define DIAG4      DIAG4_SSE2
%define SPLAT4REGS SPLAT4REGS_SSE2
vp6_filter_diag4 sse2, 8
@ -1,108 +0,0 @@ |
||||
/**
|
||||
* @file |
||||
* MMX-optimized functions for the VP6 decoder |
||||
* |
||||
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "libavutil/x86_cpu.h" |
||||
#include "libavcodec/dsputil.h" |
||||
#include "dsputil_mmx.h" |
||||
#include "vp6dsp_mmx.h" |
||||
|
||||
|
||||
/*
 * Inline-asm fragment: one 4-tap filter pass over 8 pixels.
 *
 * in1..in4 are the byte displacements of the four taps relative to asm
 * operand %0 (source pointer). Operand %1 is the destination pointer and
 * operand %2 points to the tap weights, 8 bytes per tap.
 * Expects mm7 = 0 and mm6 = rounding constant; clobbers mm0-mm5.
 * mm0-mm2 carry pixels 0-3, mm3-mm5 carry pixels 4-7.
 */
#define DIAG4_MMX(in1,in2,in3,in4) \
    /* taps 0 and 1 */ \
    "movq "#in1"(%0), %%mm0 \n\t" \
    "movq "#in2"(%0), %%mm1 \n\t" \
    "movq %%mm0, %%mm3 \n\t" \
    "movq %%mm1, %%mm4 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpcklbw %%mm7, %%mm1 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "punpckhbw %%mm7, %%mm4 \n\t" \
    "pmullw 0(%2), %%mm0 \n\t" /* src[x-8 ] * biweight [0] */ \
    "pmullw 8(%2), %%mm1 \n\t" /* src[x   ] * biweight [1] */ \
    "pmullw 0(%2), %%mm3 \n\t" /* src[x-8 ] * biweight [0] */ \
    "pmullw 8(%2), %%mm4 \n\t" /* src[x   ] * biweight [1] */ \
    "paddw %%mm1, %%mm0 \n\t" \
    "paddw %%mm4, %%mm3 \n\t" \
    /* taps 2 and 3 */ \
    "movq "#in3"(%0), %%mm1 \n\t" \
    "movq "#in4"(%0), %%mm2 \n\t" \
    "movq %%mm1, %%mm4 \n\t" \
    "movq %%mm2, %%mm5 \n\t" \
    "punpcklbw %%mm7, %%mm1 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm4 \n\t" \
    "punpckhbw %%mm7, %%mm5 \n\t" \
    "pmullw 16(%2), %%mm1 \n\t" /* src[x+8 ] * biweight [2] */ \
    "pmullw 24(%2), %%mm2 \n\t" /* src[x+16] * biweight [3] */ \
    "pmullw 16(%2), %%mm4 \n\t" /* src[x+8 ] * biweight [2] */ \
    "pmullw 24(%2), %%mm5 \n\t" /* src[x+16] * biweight [3] */ \
    "paddw %%mm2, %%mm1 \n\t" \
    "paddw %%mm5, %%mm4 \n\t" \
    /* combine, round, scale, pack and store */ \
    "paddsw %%mm1, %%mm0 \n\t" \
    "paddsw %%mm4, %%mm3 \n\t" \
    "paddsw %%mm6, %%mm0 \n\t" /* Add 64 */ \
    "paddsw %%mm6, %%mm3 \n\t" /* Add 64 */ \
    "psraw $7, %%mm0 \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "packuswb %%mm3, %%mm0 \n\t" \
    "movq %%mm0, (%1) \n\t"
||||
|
||||
void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride, |
||||
const int16_t *h_weights, const int16_t *v_weights) |
||||
{ |
||||
uint8_t tmp[8*11], *t = tmp; |
||||
int16_t weights[4*4]; |
||||
int i; |
||||
src -= stride; |
||||
|
||||
for (i=0; i<4*4; i++) |
||||
weights[i] = h_weights[i>>2]; |
||||
|
||||
__asm__ volatile( |
||||
"pxor %%mm7, %%mm7 \n\t" |
||||
"movq "MANGLE(ff_pw_64)", %%mm6 \n\t" |
||||
"1: \n\t" |
||||
DIAG4_MMX(-1,0,1,2) |
||||
"add $8, %1 \n\t" |
||||
"add %3, %0 \n\t" |
||||
"decl %4 \n\t" |
||||
"jnz 1b \n\t" |
||||
: "+r"(src), "+r"(t) |
||||
: "r"(weights), "r"((x86_reg)stride), "r"(11) |
||||
: "memory"); |
||||
|
||||
t = tmp + 8; |
||||
for (i=0; i<4*4; i++) |
||||
weights[i] = v_weights[i>>2]; |
||||
|
||||
__asm__ volatile( |
||||
"pxor %%mm7, %%mm7 \n\t" |
||||
"movq "MANGLE(ff_pw_64)", %%mm6 \n\t" |
||||
"1: \n\t" |
||||
DIAG4_MMX(-8,0,8,16) |
||||
"add $8, %0 \n\t" |
||||
"add %3, %1 \n\t" |
||||
"decl %4 \n\t" |
||||
"jnz 1b \n\t" |
||||
: "+r"(t), "+r"(dst) |
||||
: "r"(weights), "r"((x86_reg)stride), "r"(8) |
||||
: "memory"); |
||||
} |
@ -1,30 +0,0 @@ |
||||
/*
|
||||
* vp6dsp MMX function declarations |
||||
* Copyright (c) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_X86_VP6DSP_MMX_H |
||||
#define AVCODEC_X86_VP6DSP_MMX_H |
||||
|
||||
#include <stdint.h> |
||||
|
||||
/**
 * MMX version of the VP6 4-tap diagonal filter.
 * dst and src are both stepped by stride bytes per row; h_weights and
 * v_weights each point to four 16-bit filter weights.
 */
void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride,
                             const int16_t *h_weights,
                             const int16_t *v_weights);
||||
|
||||
#endif /* AVCODEC_X86_VP6DSP_MMX_H */ |
@ -1,98 +0,0 @@ |
||||
/**
|
||||
* @file |
||||
* SSE2-optimized functions for the VP6 decoder |
||||
* |
||||
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "libavutil/x86_cpu.h" |
||||
#include "libavcodec/dsputil.h" |
||||
#include "dsputil_mmx.h" |
||||
#include "vp6dsp_sse2.h" |
||||
|
||||
/*
 * Inline-asm fragment: one 4-tap filter pass over 8 pixels using XMM
 * registers.
 *
 * in1..in4 are the byte displacements of the four taps relative to asm
 * operand %0 (source pointer); operand %1 is the destination pointer.
 * Expects xmm7 = 0 and the tap weights broadcast in xmm4 (tap 0),
 * xmm5 (tap 1), xmm6 (tap 2) and xmm3 (tap 3); clobbers xmm0-xmm2.
 */
#define DIAG4_SSE2(in1,in2,in3,in4) \
    /* taps 0 and 1 */ \
    "movq "#in1"(%0), %%xmm0 \n\t" \
    "movq "#in2"(%0), %%xmm1 \n\t" \
    "punpcklbw %%xmm7, %%xmm0 \n\t" \
    "punpcklbw %%xmm7, %%xmm1 \n\t" \
    "pmullw %%xmm4, %%xmm0 \n\t" /* src[x-8 ] * biweight [0] */ \
    "pmullw %%xmm5, %%xmm1 \n\t" /* src[x   ] * biweight [1] */ \
    "paddw %%xmm1, %%xmm0 \n\t" \
    /* taps 2 and 3 */ \
    "movq "#in3"(%0), %%xmm1 \n\t" \
    "movq "#in4"(%0), %%xmm2 \n\t" \
    "punpcklbw %%xmm7, %%xmm1 \n\t" \
    "punpcklbw %%xmm7, %%xmm2 \n\t" \
    "pmullw %%xmm6, %%xmm1 \n\t" /* src[x+8 ] * biweight [2] */ \
    "pmullw %%xmm3, %%xmm2 \n\t" /* src[x+16] * biweight [3] */ \
    "paddw %%xmm2, %%xmm1 \n\t" \
    /* combine, round, scale, pack and store */ \
    "paddsw %%xmm1, %%xmm0 \n\t" \
    "paddsw "MANGLE(ff_pw_64)", %%xmm0 \n\t" /* Add 64 */ \
    "psraw $7, %%xmm0 \n\t" \
    "packuswb %%xmm0, %%xmm0 \n\t" \
    "movq %%xmm0, (%1) \n\t"
|
||||
|
||||
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, |
||||
const int16_t *h_weights,const int16_t *v_weights) |
||||
{ |
||||
uint8_t tmp[8*11], *t = tmp; |
||||
src -= stride; |
||||
|
||||
__asm__ volatile( |
||||
"pxor %%xmm7, %%xmm7 \n\t" |
||||
"movq %4, %%xmm3 \n\t" |
||||
"pshuflw $0, %%xmm3, %%xmm4 \n\t" |
||||
"punpcklqdq %%xmm4, %%xmm4 \n\t" |
||||
"pshuflw $85, %%xmm3, %%xmm5 \n\t" |
||||
"punpcklqdq %%xmm5, %%xmm5 \n\t" |
||||
"pshuflw $170, %%xmm3, %%xmm6 \n\t" |
||||
"punpcklqdq %%xmm6, %%xmm6 \n\t" |
||||
"pshuflw $255, %%xmm3, %%xmm3 \n\t" |
||||
"punpcklqdq %%xmm3, %%xmm3 \n\t" |
||||
"1: \n\t" |
||||
DIAG4_SSE2(-1,0,1,2) |
||||
"add $8, %1 \n\t" |
||||
"add %2, %0 \n\t" |
||||
"decl %3 \n\t" |
||||
"jnz 1b \n\t" |
||||
: "+r"(src), "+r"(t) |
||||
: "g"((x86_reg)stride), "r"(11), "m"(*(const int64_t*)h_weights) |
||||
: "memory"); |
||||
|
||||
t = tmp + 8; |
||||
|
||||
__asm__ volatile( |
||||
"movq %4, %%xmm3 \n\t" |
||||
"pshuflw $0, %%xmm3, %%xmm4 \n\t" |
||||
"punpcklqdq %%xmm4, %%xmm4 \n\t" |
||||
"pshuflw $85, %%xmm3, %%xmm5 \n\t" |
||||
"punpcklqdq %%xmm5, %%xmm5 \n\t" |
||||
"pshuflw $170, %%xmm3, %%xmm6 \n\t" |
||||
"punpcklqdq %%xmm6, %%xmm6 \n\t" |
||||
"pshuflw $255, %%xmm3, %%xmm3 \n\t" |
||||
"punpcklqdq %%xmm3, %%xmm3 \n\t" |
||||
"1: \n\t" |
||||
DIAG4_SSE2(-8,0,8,16) |
||||
"add $8, %0 \n\t" |
||||
"add %2, %1 \n\t" |
||||
"decl %3 \n\t" |
||||
"jnz 1b \n\t" |
||||
: "+r"(t), "+r"(dst) |
||||
: "g"((x86_reg)stride), "r"(8), "m"(*(const int64_t*)v_weights) |
||||
: "memory"); |
||||
} |
@ -1,30 +0,0 @@ |
||||
/*
|
||||
* vp6dsp SSE2 function declarations |
||||
* Copyright (c) 2009 Zuxy Meng <zuxy.meng@gmail.com> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_X86_VP6DSP_SSE2_H |
||||
#define AVCODEC_X86_VP6DSP_SSE2_H |
||||
|
||||
#include <stdint.h> |
||||
|
||||
/**
 * SSE2 version of the VP6 4-tap diagonal filter.
 * dst and src are both stepped by stride bytes per row; h_weights and
 * v_weights each point to four 16-bit filter weights.
 */
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride,
                              const int16_t *h_weights,
                              const int16_t *v_weights);
||||
|
||||
#endif /* AVCODEC_X86_VP6DSP_SSE2_H */ |
Loading…
Reference in new issue