mirror of https://github.com/FFmpeg/FFmpeg.git
This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com>oldabi
parent
770c410fbb
commit
c73d99e672
32 changed files with 1204 additions and 882 deletions
@ -0,0 +1,48 @@ |
||||
/*
|
||||
* ARM optimized Format Conversion Utils |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include <stdint.h> |
||||
|
||||
#include "libavcodec/avcodec.h" |
||||
#include "libavcodec/fmtconvert.h" |
||||
|
||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, |
||||
float mul, int len); |
||||
|
||||
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); |
||||
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); |
||||
|
||||
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); |
||||
|
||||
/**
 * Install the ARM-specific conversion routines into a FmtConvertContext.
 *
 * The VFP float->int16 routine is selected when both HAVE_ARMVFP and
 * HAVE_ARMV6 are set.  When HAVE_NEON is set, the NEON int32->float routine
 * is always selected, and the NEON float->int16 routines are selected only
 * if the codec context does not request CODEC_FLAG_BITEXACT.
 *
 * @param c     context whose function pointers are updated
 * @param avctx codec context; only its flags field is read
 */
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
{
    if (HAVE_ARMVFP && HAVE_ARMV6)
        c->float_to_int16 = ff_float_to_int16_vfp;

    if (!HAVE_NEON)
        return;

    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;

    /* The NEON float->int16 code is skipped when bit-exact output is requested. */
    if (avctx->flags & CODEC_FLAG_BITEXACT)
        return;

    c->float_to_int16            = ff_float_to_int16_neon;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
}
@ -0,0 +1,391 @@ |
||||
/* |
||||
* ARM NEON optimised Format Conversion Utils |
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "config.h" |
||||
#include "asm.S" |
||||
|
||||
preserve8 |
||||
.text |
||||
|
||||
/*
 * ff_float_to_int16_neon: convert an array of floats to int16_t.
 * Declared on the C side as
 *   void ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
 * NOTE(review): r0 = dst, r1 = src, r2 = len is assumed from the standard
 * ARM calling convention -- confirm.
 *
 * Each float is converted to 32-bit fixed point with 16 fractional bits
 * (vcvt ... #16) and then narrowed by keeping the upper 16 bits of every
 * lane (vshrn ... #16).  All loads and stores carry a :128 alignment hint.
 * The first 8 samples are always loaded before len is examined.
 */
function ff_float_to_int16_neon, export=1 |
||||
subs r2, r2, #8 |
||||
vld1.64 {d0-d1}, [r1,:128]! |
||||
vcvt.s32.f32 q8, q0, #16 |
||||
vld1.64 {d2-d3}, [r1,:128]! |
||||
vcvt.s32.f32 q9, q1, #16 |
||||
/* len == 8: only the 8 samples already converted remain to be stored */
beq 3f |
||||
/* ip = remaining count rounded down to a multiple of 16; zero means no main-loop pass */
bics ip, r2, #15 |
||||
beq 2f |
||||
/* main loop: stores 16 samples per pass and pre-converts the next 8 into q8/q9 */
1: subs ip, ip, #16 |
||||
vshrn.s32 d4, q8, #16 |
||||
vld1.64 {d0-d1}, [r1,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vshrn.s32 d5, q9, #16 |
||||
vld1.64 {d2-d3}, [r1,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
vshrn.s32 d6, q0, #16 |
||||
vst1.64 {d4-d5}, [r0,:128]! |
||||
vshrn.s32 d7, q1, #16 |
||||
vld1.64 {d16-d17},[r1,:128]! |
||||
vcvt.s32.f32 q8, q8, #16 |
||||
vld1.64 {d18-d19},[r1,:128]! |
||||
vcvt.s32.f32 q9, q9, #16 |
||||
vst1.64 {d6-d7}, [r0,:128]! |
||||
bne 1b |
||||
/* remainder after the 16-sample passes: zero -> only the pending 8 are left */
ands r2, r2, #15 |
||||
beq 3f |
||||
/* tail of 16: the pending 8 in q8/q9 plus 8 freshly loaded samples */
2: vld1.64 {d0-d1}, [r1,:128]! |
||||
vshrn.s32 d4, q8, #16 |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vld1.64 {d2-d3}, [r1,:128]! |
||||
vshrn.s32 d5, q9, #16 |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
vshrn.s32 d6, q0, #16 |
||||
vst1.64 {d4-d5}, [r0,:128]! |
||||
vshrn.s32 d7, q1, #16 |
||||
vst1.64 {d6-d7}, [r0,:128]! |
||||
bx lr |
||||
/* tail of 8: narrow and store the pending samples held in q8/q9 */
3: vshrn.s32 d4, q8, #16 |
||||
vshrn.s32 d5, q9, #16 |
||||
vst1.64 {d4-d5}, [r0,:128]! |
||||
bx lr |
||||
endfunc |
||||
|
||||
/*
 * ff_float_to_int16_interleave_neon: convert per-channel float arrays to one
 * interleaved int16_t array.  Declared on the C side as
 *   void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int)
 * NOTE(review): r0 = dst, r1 = src (array of channel pointers), r2 = len,
 * r3 = channels is assumed from the standard ARM calling convention -- confirm.
 *
 * Dispatch:
 *   channels < 2  : tail-branch to ff_float_to_int16_neon with src[0]
 *   channels == 2 : dedicated stereo path using contiguous stores
 *   otherwise     : generic path that handles the channels in groups of
 *                   4, then 2, then 1, writing with a stride of
 *                   channels * 2 bytes (held in ip)
 *
 * As in ff_float_to_int16_neon, samples are converted to fixed point with
 * 16 fractional bits; the upper halfword of each 32-bit lane is the output.
 * vsri.32 X, Y, #16 merges two channels: the low halfword of each lane of X
 * is replaced by the upper halfword of the matching lane of Y.
 */
function ff_float_to_int16_interleave_neon, export=1 |
||||
cmp r3, #2 |
||||
ldrlt r1, [r1] |
||||
blt ff_float_to_int16_neon |
||||
bne 4f |
||||
|
||||
/* exactly two channels: r3 = src[0], r1 = src[1] */
ldr r3, [r1] |
||||
ldr r1, [r1, #4] |
||||
|
||||
subs r2, r2, #8 |
||||
vld1.64 {d0-d1}, [r3,:128]! |
||||
vcvt.s32.f32 q8, q0, #16 |
||||
vld1.64 {d2-d3}, [r3,:128]! |
||||
vcvt.s32.f32 q9, q1, #16 |
||||
vld1.64 {d20-d21},[r1,:128]! |
||||
vcvt.s32.f32 q10, q10, #16 |
||||
vld1.64 {d22-d23},[r1,:128]! |
||||
vcvt.s32.f32 q11, q11, #16 |
||||
/* len == 8: only the already converted samples remain */
beq 3f |
||||
bics ip, r2, #15 |
||||
beq 2f |
||||
/* stereo main loop: 16 samples per channel per pass */
1: subs ip, ip, #16 |
||||
vld1.64 {d0-d1}, [r3,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vsri.32 q10, q8, #16 |
||||
vld1.64 {d2-d3}, [r3,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
vld1.64 {d24-d25},[r1,:128]! |
||||
vcvt.s32.f32 q12, q12, #16 |
||||
vld1.64 {d26-d27},[r1,:128]! |
||||
vsri.32 q11, q9, #16 |
||||
vst1.64 {d20-d21},[r0,:128]! |
||||
vcvt.s32.f32 q13, q13, #16 |
||||
vst1.64 {d22-d23},[r0,:128]! |
||||
vsri.32 q12, q0, #16 |
||||
vld1.64 {d16-d17},[r3,:128]! |
||||
vsri.32 q13, q1, #16 |
||||
vst1.64 {d24-d25},[r0,:128]! |
||||
vcvt.s32.f32 q8, q8, #16 |
||||
vld1.64 {d18-d19},[r3,:128]! |
||||
vcvt.s32.f32 q9, q9, #16 |
||||
vld1.64 {d20-d21},[r1,:128]! |
||||
vcvt.s32.f32 q10, q10, #16 |
||||
vld1.64 {d22-d23},[r1,:128]! |
||||
vcvt.s32.f32 q11, q11, #16 |
||||
vst1.64 {d26-d27},[r0,:128]! |
||||
bne 1b |
||||
ands r2, r2, #15 |
||||
beq 3f |
||||
/* stereo tail of 16: pending 8 per channel plus 8 freshly loaded per channel */
2: vsri.32 q10, q8, #16 |
||||
vld1.64 {d0-d1}, [r3,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vld1.64 {d2-d3}, [r3,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
vld1.64 {d24-d25},[r1,:128]! |
||||
vcvt.s32.f32 q12, q12, #16 |
||||
vsri.32 q11, q9, #16 |
||||
vld1.64 {d26-d27},[r1,:128]! |
||||
vcvt.s32.f32 q13, q13, #16 |
||||
vst1.64 {d20-d21},[r0,:128]! |
||||
vsri.32 q12, q0, #16 |
||||
vst1.64 {d22-d23},[r0,:128]! |
||||
vsri.32 q13, q1, #16 |
||||
vst1.64 {d24-d27},[r0,:128]! |
||||
bx lr |
||||
/* stereo tail of 8: merge and store the pending samples */
3: vsri.32 q10, q8, #16 |
||||
vsri.32 q11, q9, #16 |
||||
vst1.64 {d20-d23},[r0,:128]! |
||||
bx lr |
||||
|
||||
/* generic path (more than two channels): ip = channels * 2 = output stride in bytes */
4: push {r4-r8,lr} |
||||
cmp r3, #4 |
||||
lsl ip, r3, #1 |
||||
blt 4f |
||||
|
||||
@ 4 channels
|
||||
/* take the next four channel pointers; lr counts samples, r8 walks the output */
5: ldmia r1!, {r4-r7} |
||||
mov lr, r2 |
||||
mov r8, r0 |
||||
vld1.64 {d16-d17},[r4,:128]! |
||||
vcvt.s32.f32 q8, q8, #16 |
||||
vld1.64 {d18-d19},[r5,:128]! |
||||
vcvt.s32.f32 q9, q9, #16 |
||||
vld1.64 {d20-d21},[r6,:128]! |
||||
vcvt.s32.f32 q10, q10, #16 |
||||
vld1.64 {d22-d23},[r7,:128]! |
||||
vcvt.s32.f32 q11, q11, #16 |
||||
/* 8 samples per pass; each 64-bit store writes one 4-channel group, then steps by ip */
6: subs lr, lr, #8 |
||||
vld1.64 {d0-d1}, [r4,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vsri.32 q9, q8, #16 |
||||
vld1.64 {d2-d3}, [r5,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
vsri.32 q11, q10, #16 |
||||
vld1.64 {d4-d5}, [r6,:128]! |
||||
vcvt.s32.f32 q2, q2, #16 |
||||
vzip.32 d18, d22 |
||||
vld1.64 {d6-d7}, [r7,:128]! |
||||
vcvt.s32.f32 q3, q3, #16 |
||||
vzip.32 d19, d23 |
||||
vst1.64 {d18}, [r8], ip |
||||
vsri.32 q1, q0, #16 |
||||
vst1.64 {d22}, [r8], ip |
||||
vsri.32 q3, q2, #16 |
||||
vst1.64 {d19}, [r8], ip |
||||
vzip.32 d2, d6 |
||||
vst1.64 {d23}, [r8], ip |
||||
vzip.32 d3, d7 |
||||
beq 7f |
||||
vld1.64 {d16-d17},[r4,:128]! |
||||
vcvt.s32.f32 q8, q8, #16 |
||||
vst1.64 {d2}, [r8], ip |
||||
vld1.64 {d18-d19},[r5,:128]! |
||||
vcvt.s32.f32 q9, q9, #16 |
||||
vst1.64 {d6}, [r8], ip |
||||
vld1.64 {d20-d21},[r6,:128]! |
||||
vcvt.s32.f32 q10, q10, #16 |
||||
vst1.64 {d3}, [r8], ip |
||||
vld1.64 {d22-d23},[r7,:128]! |
||||
vcvt.s32.f32 q11, q11, #16 |
||||
vst1.64 {d7}, [r8], ip |
||||
b 6b |
||||
/* last pass for this group of four: flush, then return or advance dst by 8 bytes */
7: vst1.64 {d2}, [r8], ip |
||||
vst1.64 {d6}, [r8], ip |
||||
vst1.64 {d3}, [r8], ip |
||||
vst1.64 {d7}, [r8], ip |
||||
subs r3, r3, #4 |
||||
popeq {r4-r8,pc} |
||||
cmp r3, #4 |
||||
add r0, r0, #8 |
||||
bge 5b |
||||
|
||||
@ 2 channels
|
||||
/* fewer than two channels left -> single-channel code below */
4: cmp r3, #2 |
||||
blt 4f |
||||
ldmia r1!, {r4-r5} |
||||
mov lr, r2 |
||||
mov r8, r0 |
||||
/* bit 3 of len selects whether an extra 8-sample step precedes the 16-sample loop */
tst lr, #8 |
||||
vld1.64 {d16-d17},[r4,:128]! |
||||
vcvt.s32.f32 q8, q8, #16 |
||||
vld1.64 {d18-d19},[r5,:128]! |
||||
vcvt.s32.f32 q9, q9, #16 |
||||
vld1.64 {d20-d21},[r4,:128]! |
||||
vcvt.s32.f32 q10, q10, #16 |
||||
vld1.64 {d22-d23},[r5,:128]! |
||||
vcvt.s32.f32 q11, q11, #16 |
||||
beq 6f |
||||
subs lr, lr, #8 |
||||
beq 7f |
||||
/* extra 8-sample step: each 32-bit lane store writes one channel pair, then steps by ip */
vsri.32 d18, d16, #16 |
||||
vsri.32 d19, d17, #16 |
||||
vld1.64 {d16-d17},[r4,:128]! |
||||
vcvt.s32.f32 q8, q8, #16 |
||||
vst1.32 {d18[0]}, [r8], ip |
||||
vsri.32 d22, d20, #16 |
||||
vst1.32 {d18[1]}, [r8], ip |
||||
vsri.32 d23, d21, #16 |
||||
vst1.32 {d19[0]}, [r8], ip |
||||
vst1.32 {d19[1]}, [r8], ip |
||||
vld1.64 {d18-d19},[r5,:128]! |
||||
vcvt.s32.f32 q9, q9, #16 |
||||
vst1.32 {d22[0]}, [r8], ip |
||||
vst1.32 {d22[1]}, [r8], ip |
||||
vld1.64 {d20-d21},[r4,:128]! |
||||
vcvt.s32.f32 q10, q10, #16 |
||||
vst1.32 {d23[0]}, [r8], ip |
||||
vst1.32 {d23[1]}, [r8], ip |
||||
vld1.64 {d22-d23},[r5,:128]! |
||||
vcvt.s32.f32 q11, q11, #16 |
||||
/* 16 samples per pass for this channel pair */
6: subs lr, lr, #16 |
||||
vld1.64 {d0-d1}, [r4,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vsri.32 d18, d16, #16 |
||||
vld1.64 {d2-d3}, [r5,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
vsri.32 d19, d17, #16 |
||||
vld1.64 {d4-d5}, [r4,:128]! |
||||
vcvt.s32.f32 q2, q2, #16 |
||||
vld1.64 {d6-d7}, [r5,:128]! |
||||
vcvt.s32.f32 q3, q3, #16 |
||||
vst1.32 {d18[0]}, [r8], ip |
||||
vsri.32 d22, d20, #16 |
||||
vst1.32 {d18[1]}, [r8], ip |
||||
vsri.32 d23, d21, #16 |
||||
vst1.32 {d19[0]}, [r8], ip |
||||
vsri.32 d2, d0, #16 |
||||
vst1.32 {d19[1]}, [r8], ip |
||||
vsri.32 d3, d1, #16 |
||||
vst1.32 {d22[0]}, [r8], ip |
||||
vsri.32 d6, d4, #16 |
||||
vst1.32 {d22[1]}, [r8], ip |
||||
vsri.32 d7, d5, #16 |
||||
vst1.32 {d23[0]}, [r8], ip |
||||
vst1.32 {d23[1]}, [r8], ip |
||||
beq 6f |
||||
vld1.64 {d16-d17},[r4,:128]! |
||||
vcvt.s32.f32 q8, q8, #16 |
||||
vst1.32 {d2[0]}, [r8], ip |
||||
vst1.32 {d2[1]}, [r8], ip |
||||
vld1.64 {d18-d19},[r5,:128]! |
||||
vcvt.s32.f32 q9, q9, #16 |
||||
vst1.32 {d3[0]}, [r8], ip |
||||
vst1.32 {d3[1]}, [r8], ip |
||||
vld1.64 {d20-d21},[r4,:128]! |
||||
vcvt.s32.f32 q10, q10, #16 |
||||
vst1.32 {d6[0]}, [r8], ip |
||||
vst1.32 {d6[1]}, [r8], ip |
||||
vld1.64 {d22-d23},[r5,:128]! |
||||
vcvt.s32.f32 q11, q11, #16 |
||||
vst1.32 {d7[0]}, [r8], ip |
||||
vst1.32 {d7[1]}, [r8], ip |
||||
bgt 6b |
||||
/* flush the last 8 merged samples of this channel pair */
6: vst1.32 {d2[0]}, [r8], ip |
||||
vst1.32 {d2[1]}, [r8], ip |
||||
vst1.32 {d3[0]}, [r8], ip |
||||
vst1.32 {d3[1]}, [r8], ip |
||||
vst1.32 {d6[0]}, [r8], ip |
||||
vst1.32 {d6[1]}, [r8], ip |
||||
vst1.32 {d7[0]}, [r8], ip |
||||
vst1.32 {d7[1]}, [r8], ip |
||||
b 8f |
||||
/* len == 8 for this channel pair: merge and store the preloaded samples only */
7: vsri.32 d18, d16, #16 |
||||
vsri.32 d19, d17, #16 |
||||
vst1.32 {d18[0]}, [r8], ip |
||||
vsri.32 d22, d20, #16 |
||||
vst1.32 {d18[1]}, [r8], ip |
||||
vsri.32 d23, d21, #16 |
||||
vst1.32 {d19[0]}, [r8], ip |
||||
vst1.32 {d19[1]}, [r8], ip |
||||
vst1.32 {d22[0]}, [r8], ip |
||||
vst1.32 {d22[1]}, [r8], ip |
||||
vst1.32 {d23[0]}, [r8], ip |
||||
vst1.32 {d23[1]}, [r8], ip |
||||
/* two channels done: return if none remain, else dst has advanced by 4 bytes */
8: subs r3, r3, #2 |
||||
add r0, r0, #4 |
||||
popeq {r4-r8,pc} |
||||
|
||||
@ 1 channel
|
||||
/* last channel: lanes [1] and [3] are the upper halfwords of the fixed-point values */
4: ldr r4, [r1],#4 |
||||
tst r2, #8 |
||||
mov lr, r2 |
||||
mov r5, r0 |
||||
vld1.64 {d0-d1}, [r4,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vld1.64 {d2-d3}, [r4,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
/* bit 3 of len set: store one group of 8 first (label 8 below) */
bne 8f |
||||
/* 16 samples per pass, one 16-bit store per sample, stepping by ip */
6: subs lr, lr, #16 |
||||
vld1.64 {d4-d5}, [r4,:128]! |
||||
vcvt.s32.f32 q2, q2, #16 |
||||
vld1.64 {d6-d7}, [r4,:128]! |
||||
vcvt.s32.f32 q3, q3, #16 |
||||
vst1.16 {d0[1]}, [r5,:16], ip |
||||
vst1.16 {d0[3]}, [r5,:16], ip |
||||
vst1.16 {d1[1]}, [r5,:16], ip |
||||
vst1.16 {d1[3]}, [r5,:16], ip |
||||
vst1.16 {d2[1]}, [r5,:16], ip |
||||
vst1.16 {d2[3]}, [r5,:16], ip |
||||
vst1.16 {d3[1]}, [r5,:16], ip |
||||
vst1.16 {d3[3]}, [r5,:16], ip |
||||
beq 7f |
||||
vld1.64 {d0-d1}, [r4,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vld1.64 {d2-d3}, [r4,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
7: vst1.16 {d4[1]}, [r5,:16], ip |
||||
vst1.16 {d4[3]}, [r5,:16], ip |
||||
vst1.16 {d5[1]}, [r5,:16], ip |
||||
vst1.16 {d5[3]}, [r5,:16], ip |
||||
vst1.16 {d6[1]}, [r5,:16], ip |
||||
vst1.16 {d6[3]}, [r5,:16], ip |
||||
vst1.16 {d7[1]}, [r5,:16], ip |
||||
vst1.16 {d7[3]}, [r5,:16], ip |
||||
bgt 6b |
||||
pop {r4-r8,pc} |
||||
/* leading group of 8: store it, return if that was all, else preload and join the loop */
8: subs lr, lr, #8 |
||||
vst1.16 {d0[1]}, [r5,:16], ip |
||||
vst1.16 {d0[3]}, [r5,:16], ip |
||||
vst1.16 {d1[1]}, [r5,:16], ip |
||||
vst1.16 {d1[3]}, [r5,:16], ip |
||||
vst1.16 {d2[1]}, [r5,:16], ip |
||||
vst1.16 {d2[3]}, [r5,:16], ip |
||||
vst1.16 {d3[1]}, [r5,:16], ip |
||||
vst1.16 {d3[3]}, [r5,:16], ip |
||||
popeq {r4-r8,pc} |
||||
vld1.64 {d0-d1}, [r4,:128]! |
||||
vcvt.s32.f32 q0, q0, #16 |
||||
vld1.64 {d2-d3}, [r4,:128]! |
||||
vcvt.s32.f32 q1, q1, #16 |
||||
b 6b |
||||
endfunc |
||||
|
||||
/*
 * ff_int32_to_float_fmul_scalar_neon: dst[i] = (float)src[i] * mul.
 * Declared on the C side as
 *   void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
 *                                           float mul, int len)
 * NOTE(review): r0 = dst and r1 = src are assumed from the standard ARM
 * calling convention.  The VFP/NOVFP line prefixes presumably select between
 * a build where mul arrives in s0 (len in r2) and one where mul arrives in r2
 * (len in r3); their definitions are not visible here -- confirm in asm.S.
 *
 * 8 elements are handled per pass; the first 8 are loaded before len is
 * examined, and the loop ends when the running count reaches exactly zero.
 */
function ff_int32_to_float_fmul_scalar_neon, export=1 |
||||
/* broadcast mul to all four lanes of q0 and name the register that holds len */
VFP vdup.32 q0, d0[0] |
||||
VFP len .req r2 |
||||
NOVFP vdup.32 q0, r2 |
||||
NOVFP len .req r3 |
||||
|
||||
vld1.32 {q1},[r1,:128]! |
||||
vcvt.f32.s32 q3, q1 |
||||
vld1.32 {q2},[r1,:128]! |
||||
vcvt.f32.s32 q8, q2 |
||||
/* multiply the 8 converted values; unless finished, load/convert the next 8 before storing */
1: subs len, len, #8 |
||||
pld [r1, #16] |
||||
vmul.f32 q9, q3, q0 |
||||
vmul.f32 q10, q8, q0 |
||||
beq 2f |
||||
vld1.32 {q1},[r1,:128]! |
||||
vcvt.f32.s32 q3, q1 |
||||
vld1.32 {q2},[r1,:128]! |
||||
vcvt.f32.s32 q8, q2 |
||||
vst1.32 {q9}, [r0,:128]! |
||||
vst1.32 {q10},[r0,:128]! |
||||
b 1b |
||||
/* final pass: store the last 8 products and return */
2: vst1.32 {q9}, [r0,:128]! |
||||
vst1.32 {q10},[r0,:128]! |
||||
bx lr |
||||
.unreq len
|
||||
endfunc |
@ -0,0 +1,77 @@ |
||||
/* |
||||
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
|
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "config.h" |
||||
#include "asm.S" |
||||
|
||||
.syntax unified
|
||||
|
||||
/** |
||||
* ARM VFP optimized float to int16 conversion. |
||||
* Assume that len is a positive number and is multiple of 8, destination |
||||
* buffer is at least 4 bytes aligned (8 bytes alignment is better for |
||||
* performance), little endian byte sex |
||||
*/ |
||||
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len)
|
||||
/*
 * Register use inside the loop: each pass converts 8 floats.  The integer
 * results are moved to core registers, clamped to 16 bits with ssat, packed
 * two per word with pkhbt and written with one 4-word stmia.
 * NOTE(review): r0 = dst, r1 = src, r2 = len is assumed from the standard ARM
 * calling convention -- confirm.
 */
function ff_float_to_int16_vfp, export=1 |
||||
/* r4-r8/lr and d8-d11 are used as scratch below and restored on exit */
push {r4-r8,lr} |
||||
vpush {d8-d11} |
||||
/* load and convert the first 8 floats before entering the loop */
vldmia r1!, {s16-s23} |
||||
vcvt.s32.f32 s0, s16 |
||||
vcvt.s32.f32 s1, s17 |
||||
vcvt.s32.f32 s2, s18 |
||||
vcvt.s32.f32 s3, s19 |
||||
vcvt.s32.f32 s4, s20 |
||||
vcvt.s32.f32 s5, s21 |
||||
vcvt.s32.f32 s6, s22 |
||||
vcvt.s32.f32 s7, s23 |
||||
1: |
||||
/* flags set here drive every gt-conditional instruction and the bgt at the end of the pass */
subs r2, r2, #8 |
||||
vmov r3, r4, s0, s1 |
||||
vmov r5, r6, s2, s3 |
||||
vmov r7, r8, s4, s5 |
||||
vmov ip, lr, s6, s7 |
||||
/* if more input remains, fetch the next 8 floats while the current 8 are clamped and packed */
vldmiagt r1!, {s16-s23} |
||||
ssat r4, #16, r4 |
||||
ssat r3, #16, r3 |
||||
ssat r6, #16, r6 |
||||
ssat r5, #16, r5 |
||||
/* pkhbt: low halfword from the first source, high halfword from the second shifted left by 16 */
pkhbt r3, r3, r4, lsl #16 |
||||
pkhbt r4, r5, r6, lsl #16 |
||||
vcvtgt.s32.f32 s0, s16 |
||||
vcvtgt.s32.f32 s1, s17 |
||||
vcvtgt.s32.f32 s2, s18 |
||||
vcvtgt.s32.f32 s3, s19 |
||||
vcvtgt.s32.f32 s4, s20 |
||||
vcvtgt.s32.f32 s5, s21 |
||||
vcvtgt.s32.f32 s6, s22 |
||||
vcvtgt.s32.f32 s7, s23 |
||||
ssat r8, #16, r8 |
||||
ssat r7, #16, r7 |
||||
ssat lr, #16, lr |
||||
ssat ip, #16, ip |
||||
pkhbt r5, r7, r8, lsl #16 |
||||
pkhbt r6, ip, lr, lsl #16 |
||||
/* write 8 packed int16 values (16 bytes) and advance dst */
stmia r0!, {r3-r6} |
||||
bgt 1b |
||||
|
||||
vpop {d8-d11} |
||||
pop {r4-r8,pc} |
||||
endfunc |
@ -0,0 +1,68 @@ |
||||
/*
|
||||
* Format Conversion Utils |
||||
* Copyright (c) 2000, 2001 Fabrice Bellard |
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "avcodec.h" |
||||
#include "fmtconvert.h" |
||||
|
||||
/**
 * Portable C version: convert each int to float and scale it by mul.
 *
 * Elements are processed in ascending order; nothing is written when
 * len is zero or negative.
 *
 * @param dst destination array of len floats
 * @param src source array of len ints
 * @param mul factor applied to every converted element
 * @param len number of elements
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len)
{
    int remaining;

    for (remaining = len; remaining > 0; remaining--)
        *dst++ = *src++ * mul;
}
||||
|
||||
/**
 * Round one float with lrintf() and clip the result with av_clip_int16().
 *
 * @param src pointer to the float to convert
 * @return the clipped integer value
 */
static av_always_inline int float_to_int16_one(const float *src){
    const long rounded = lrintf(*src);

    return av_clip_int16(rounded);
}
||||
|
||||
/**
 * Portable C version: convert len floats to int16_t, one element at a time,
 * using float_to_int16_one() for rounding and clipping.
 *
 * @param dst destination array of len int16_t
 * @param src source array of len floats
 * @param len number of elements
 */
static void float_to_int16_c(int16_t *dst, const float *src, long len)
{
    int idx = 0;

    while (idx < len) {
        dst[idx] = float_to_int16_one(&src[idx]);
        idx++;
    }
}
||||
|
||||
/**
 * Portable C version: convert per-channel float arrays into one interleaved
 * int16_t array (sample n of channel ch lands at dst[n * channels + ch]).
 *
 * Two channels are handled frame by frame; any other channel count is
 * handled one whole channel at a time.
 *
 * @param dst      destination array of len * channels int16_t
 * @param src      array of channels pointers, each to len floats
 * @param len      number of samples per channel
 * @param channels number of channels
 */
static void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                        long len, int channels)
{
    int ch, n, out;

    if (channels != 2) {
        for (ch = 0; ch < channels; ch++) {
            out = ch;
            for (n = 0; n < len; n++) {
                dst[out] = float_to_int16_one(src[ch] + n);
                out += channels;
            }
        }
        return;
    }

    for (n = 0; n < len; n++) {
        int16_t *frame = dst + 2*n;

        frame[0] = float_to_int16_one(src[0] + n);
        frame[1] = float_to_int16_one(src[1] + n);
    }
}
||||
|
||||
/**
 * Fill a FmtConvertContext with the portable C routines, then let the
 * architecture-specific initializers override them.
 *
 * @param c     context to initialize
 * @param avctx codec context handed on to the architecture initializers
 */
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
    /* C defaults; every pointer is set before any override runs. */
    c->float_to_int16_interleave  = float_to_int16_interleave_c;
    c->float_to_int16             = float_to_int16_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;

    if (ARCH_ARM) {
        ff_fmt_convert_init_arm(c, avctx);
    }
    if (ARCH_PPC) {
        ff_fmt_convert_init_ppc(c, avctx);
    }
    if (HAVE_MMX) {
        ff_fmt_convert_init_x86(c, avctx);
    }
}
@ -0,0 +1,79 @@ |
||||
/*
|
||||
* Format Conversion Utils |
||||
* Copyright (c) 2000, 2001 Fabrice Bellard |
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#ifndef AVCODEC_FMTCONVERT_H |
||||
#define AVCODEC_FMTCONVERT_H |
||||
|
||||
#include "avcodec.h" |
||||
|
||||
typedef struct FmtConvertContext {
    /**
     * Convert int32_t samples to float and scale them by a constant.
     *
     * @param dst destination float array; must be 16-byte aligned
     * @param src source int32_t array; must be 16-byte aligned
     * @param mul factor each converted sample is multiplied by
     * @param len number of elements; must be a multiple of 8
     */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);

    /**
     * Convert float samples to int16_t.
     *
     * Floats in the range [-32768.0, 32767.0] are converted to integers
     * without rescaling.
     *
     * @param dst destination int16_t array; must be 16-byte aligned
     * @param src source float array; must be 16-byte aligned
     * @param len number of elements; must be a multiple of 8
     */
    void (*float_to_int16)(int16_t *dst, const float *src, long len);

    /**
     * Convert several float arrays to a single interleaved int16_t array.
     *
     * Floats in the range [-32768.0, 32767.0] are converted to integers
     * without rescaling.
     *
     * @param dst      destination array of interleaved int16_t; must be
     *                 16-byte aligned
     * @param src      array of float arrays, one per channel; must be
     *                 16-byte aligned
     * @param len      number of elements to convert; must be a multiple of 8
     * @param channels number of channels
     */
    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
                                      long len, int channels);
} FmtConvertContext;
||||
|
||||
void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); |
||||
|
||||
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); |
||||
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx); |
||||
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); |
||||
|
||||
#endif /* AVCODEC_FMTCONVERT_H */ |
@ -0,0 +1,142 @@ |
||||
/*
|
||||
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "libavcodec/fmtconvert.h" |
||||
|
||||
#include "dsputil_altivec.h" |
||||
#include "util_altivec.h" |
||||
|
||||
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) |
||||
{ |
||||
union { |
||||
vector float v; |
||||
float s[4]; |
||||
} mul_u; |
||||
int i; |
||||
vector float src1, src2, dst1, dst2, mul_v, zero; |
||||
|
||||
zero = (vector float)vec_splat_u32(0); |
||||
mul_u.s[0] = mul; |
||||
mul_v = vec_splat(mul_u.v, 0); |
||||
|
||||
for(i=0; i<len; i+=8) { |
||||
src1 = vec_ctf(vec_ld(0, src+i), 0); |
||||
src2 = vec_ctf(vec_ld(16, src+i), 0); |
||||
dst1 = vec_madd(src1, mul_v, zero); |
||||
dst2 = vec_madd(src2, mul_v, zero); |
||||
vec_st(dst1, 0, dst+i); |
||||
vec_st(dst2, 16, dst+i); |
||||
} |
||||
} |
||||
|
||||
|
||||
static vector signed short |
||||
float_to_int16_one_altivec(const float *src) |
||||
{ |
||||
vector float s0 = vec_ld(0, src); |
||||
vector float s1 = vec_ld(16, src); |
||||
vector signed int t0 = vec_cts(s0, 0); |
||||
vector signed int t1 = vec_cts(s1, 0); |
||||
return vec_packs(t0,t1); |
||||
} |
||||
|
||||
static void float_to_int16_altivec(int16_t *dst, const float *src, long len) |
||||
{ |
||||
int i; |
||||
vector signed short d0, d1, d; |
||||
vector unsigned char align; |
||||
if(((long)dst)&15) //FIXME
|
||||
for(i=0; i<len-7; i+=8) { |
||||
d0 = vec_ld(0, dst+i); |
||||
d = float_to_int16_one_altivec(src+i); |
||||
d1 = vec_ld(15, dst+i); |
||||
d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); |
||||
align = vec_lvsr(0, dst+i); |
||||
d0 = vec_perm(d1, d, align); |
||||
d1 = vec_perm(d, d1, align); |
||||
vec_st(d0, 0, dst+i); |
||||
vec_st(d1,15, dst+i); |
||||
} |
||||
else |
||||
for(i=0; i<len-7; i+=8) { |
||||
d = float_to_int16_one_altivec(src+i); |
||||
vec_st(d, 0, dst+i); |
||||
} |
||||
} |
||||
|
||||
static void |
||||
float_to_int16_interleave_altivec(int16_t *dst, const float **src, |
||||
long len, int channels) |
||||
{ |
||||
int i; |
||||
vector signed short d0, d1, d2, c0, c1, t0, t1; |
||||
vector unsigned char align; |
||||
if(channels == 1) |
||||
float_to_int16_altivec(dst, src[0], len); |
||||
else |
||||
if (channels == 2) { |
||||
if(((long)dst)&15) |
||||
for(i=0; i<len-7; i+=8) { |
||||
d0 = vec_ld(0, dst + i); |
||||
t0 = float_to_int16_one_altivec(src[0] + i); |
||||
d1 = vec_ld(31, dst + i); |
||||
t1 = float_to_int16_one_altivec(src[1] + i); |
||||
c0 = vec_mergeh(t0, t1); |
||||
c1 = vec_mergel(t0, t1); |
||||
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i)); |
||||
align = vec_lvsr(0, dst + i); |
||||
d0 = vec_perm(d2, c0, align); |
||||
d1 = vec_perm(c0, c1, align); |
||||
vec_st(d0, 0, dst + i); |
||||
d0 = vec_perm(c1, d2, align); |
||||
vec_st(d1, 15, dst + i); |
||||
vec_st(d0, 31, dst + i); |
||||
dst+=8; |
||||
} |
||||
else |
||||
for(i=0; i<len-7; i+=8) { |
||||
t0 = float_to_int16_one_altivec(src[0] + i); |
||||
t1 = float_to_int16_one_altivec(src[1] + i); |
||||
d0 = vec_mergeh(t0, t1); |
||||
d1 = vec_mergel(t0, t1); |
||||
vec_st(d0, 0, dst + i); |
||||
vec_st(d1, 16, dst + i); |
||||
dst+=8; |
||||
} |
||||
} else { |
||||
DECLARE_ALIGNED(16, int16_t, tmp)[len]; |
||||
int c, j; |
||||
for (c = 0; c < channels; c++) { |
||||
float_to_int16_altivec(tmp, src[c], len); |
||||
for (i = 0, j = c; i < len; i++, j+=channels) { |
||||
dst[j] = tmp[i]; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
/**
 * Install the AltiVec conversion routines into a FmtConvertContext.
 *
 * The int32->float routine is always installed; the float->int16 routines
 * are installed only when CODEC_FLAG_BITEXACT is not set in avctx->flags.
 *
 * @param c     context whose function pointers are updated
 * @param avctx codec context; only its flags field is read
 */
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
{
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;

    if (avctx->flags & CODEC_FLAG_BITEXACT)
        return;

    c->float_to_int16            = float_to_int16_altivec;
    c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}
@ -0,0 +1,91 @@ |
||||
;****************************************************************************** |
||||
;* x86 optimized Format Conversion Utils |
||||
;* Copyright (c) 2008 Loren Merritt |
||||
;* |
||||
;* This file is part of FFmpeg. |
||||
;* |
||||
;* FFmpeg is free software; you can redistribute it and/or |
||||
;* modify it under the terms of the GNU Lesser General Public |
||||
;* License as published by the Free Software Foundation; either |
||||
;* version 2.1 of the License, or (at your option) any later version. |
||||
;* |
||||
;* FFmpeg is distributed in the hope that it will be useful, |
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
;* Lesser General Public License for more details. |
||||
;* |
||||
;* You should have received a copy of the GNU Lesser General Public |
||||
;* License along with FFmpeg; if not, write to the Free Software |
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
;****************************************************************************** |
||||
|
||||
%include "x86inc.asm" |
||||
|
||||
section .text align=16 |
||||
|
||||
; PSWAPD_SSE dst, src: exchange the two 32-bit halves of a 64-bit MMX value
; using pshufw; selector 0x4e picks source words 2,3,0,1.
%macro PSWAPD_SSE 2 |
||||
pshufw %1, %2, 0x4e |
||||
%endmacro |
||||
; PSWAPD_3DN1 dst, src: the same dword exchange built from plain MMX
; instructions: copy src, shift its high dword down, then unpack the low
; dword of src into the upper half.
%macro PSWAPD_3DN1 2 |
||||
movq %1, %2 |
||||
psrlq %1, 32 |
||||
punpckldq %1, %2 |
||||
%endmacro |
||||
|
||||
; FLOAT_TO_INT16_INTERLEAVE6 suffix
; Emits float_to_int16_interleave6_<suffix>: converts six float channels to
; interleaved int16.  Each loop pass converts 2 samples from every channel
; (8 source bytes per channel) and writes 24 output bytes.
; cvtps2pi and pswapd may be redefined by the includer before instantiation
; to select the instruction set variant.
%macro FLOAT_TO_INT16_INTERLEAVE6 1 |
||||
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |
||||
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 |
||||
; lend names the sample counter: a copy in r10d on x86-64, the stack argument otherwise
%ifdef ARCH_X86_64 |
||||
%define lend r10d |
||||
mov lend, r2d |
||||
%else |
||||
%define lend dword r2m |
||||
%endif |
||||
; fetch the six channel pointers; srcq becomes channel 0
mov src1q, [srcq+1*gprsize] |
||||
mov src2q, [srcq+2*gprsize] |
||||
mov src3q, [srcq+3*gprsize] |
||||
mov src4q, [srcq+4*gprsize] |
||||
mov src5q, [srcq+5*gprsize] |
||||
mov srcq, [srcq] |
||||
; turn channels 1-5 into offsets from channel 0 so only srcq needs advancing
sub src1q, srcq |
||||
sub src2q, srcq |
||||
sub src3q, srcq |
||||
sub src4q, srcq |
||||
sub src5q, srcq |
||||
.loop: |
||||
; convert two floats from each of the six channels to 32-bit integers
cvtps2pi mm0, [srcq] |
||||
cvtps2pi mm1, [srcq+src1q] |
||||
cvtps2pi mm2, [srcq+src2q] |
||||
cvtps2pi mm3, [srcq+src3q] |
||||
cvtps2pi mm4, [srcq+src4q] |
||||
cvtps2pi mm5, [srcq+src5q] |
||||
; pack to int16 with signed saturation, then shuffle into interleaved order
packssdw mm0, mm3 |
||||
packssdw mm1, mm4 |
||||
packssdw mm2, mm5 |
||||
pswapd mm3, mm0 |
||||
punpcklwd mm0, mm1 |
||||
punpckhwd mm1, mm2 |
||||
punpcklwd mm2, mm3 |
||||
pswapd mm3, mm0 |
||||
punpckldq mm0, mm2 |
||||
punpckhdq mm2, mm1 |
||||
punpckldq mm1, mm3 |
||||
movq [dstq ], mm0 |
||||
movq [dstq+16], mm2 |
||||
movq [dstq+ 8], mm1 |
||||
; advance by 2 floats per channel and 12 int16 of output
add srcq, 8 |
||||
add dstq, 24 |
||||
sub lend, 2 |
||||
jg .loop |
||||
emms |
||||
RET |
||||
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
||||
|
||||
; sse variant: cvtps2pi as written in the macro, pswapd emulated with pshufw
%define pswapd PSWAPD_SSE |
||||
FLOAT_TO_INT16_INTERLEAVE6 sse |
||||
; 3dnow variant: pf2id replaces cvtps2pi, pswapd emulated with plain MMX instructions
%define cvtps2pi pf2id |
||||
%define pswapd PSWAPD_3DN1 |
||||
FLOAT_TO_INT16_INTERLEAVE6 3dnow |
||||
; 3dn2 variant: pswapd is no longer a macro, so it assembles as the instruction itself;
; cvtps2pi is still mapped to pf2id
%undef pswapd |
||||
FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
||||
%undef cvtps2pi |
@ -0,0 +1,266 @@ |
||||
/*
|
||||
* Format Conversion Utils |
||||
* Copyright (c) 2000, 2001 Fabrice Bellard |
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
* |
||||
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
||||
*/ |
||||
|
||||
#include "libavutil/cpu.h" |
||||
#include "libavutil/x86_cpu.h" |
||||
#include "libavcodec/fmtconvert.h" |
||||
|
||||
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) |
||||
{ |
||||
x86_reg i = -4*len; |
||||
__asm__ volatile( |
||||
"movss %3, %%xmm4 \n" |
||||
"shufps $0, %%xmm4, %%xmm4 \n" |
||||
"1: \n" |
||||
"cvtpi2ps (%2,%0), %%xmm0 \n" |
||||
"cvtpi2ps 8(%2,%0), %%xmm1 \n" |
||||
"cvtpi2ps 16(%2,%0), %%xmm2 \n" |
||||
"cvtpi2ps 24(%2,%0), %%xmm3 \n" |
||||
"movlhps %%xmm1, %%xmm0 \n" |
||||
"movlhps %%xmm3, %%xmm2 \n" |
||||
"mulps %%xmm4, %%xmm0 \n" |
||||
"mulps %%xmm4, %%xmm2 \n" |
||||
"movaps %%xmm0, (%1,%0) \n" |
||||
"movaps %%xmm2, 16(%1,%0) \n" |
||||
"add $32, %0 \n" |
||||
"jl 1b \n" |
||||
:"+r"(i) |
||||
:"r"(dst+len), "r"(src+len), "m"(mul) |
||||
); |
||||
} |
||||
|
||||
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) |
||||
{ |
||||
x86_reg i = -4*len; |
||||
__asm__ volatile( |
||||
"movss %3, %%xmm4 \n" |
||||
"shufps $0, %%xmm4, %%xmm4 \n" |
||||
"1: \n" |
||||
"cvtdq2ps (%2,%0), %%xmm0 \n" |
||||
"cvtdq2ps 16(%2,%0), %%xmm1 \n" |
||||
"mulps %%xmm4, %%xmm0 \n" |
||||
"mulps %%xmm4, %%xmm1 \n" |
||||
"movaps %%xmm0, (%1,%0) \n" |
||||
"movaps %%xmm1, 16(%1,%0) \n" |
||||
"add $32, %0 \n" |
||||
"jl 1b \n" |
||||
:"+r"(i) |
||||
:"r"(dst+len), "r"(src+len), "m"(mul) |
||||
); |
||||
} |
||||
|
||||
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ |
||||
x86_reg reglen = len; |
||||
// not bit-exact: pf2id uses different rounding than C and SSE
|
||||
__asm__ volatile( |
||||
"add %0 , %0 \n\t" |
||||
"lea (%2,%0,2) , %2 \n\t" |
||||
"add %0 , %1 \n\t" |
||||
"neg %0 \n\t" |
||||
"1: \n\t" |
||||
"pf2id (%2,%0,2) , %%mm0 \n\t" |
||||
"pf2id 8(%2,%0,2) , %%mm1 \n\t" |
||||
"pf2id 16(%2,%0,2) , %%mm2 \n\t" |
||||
"pf2id 24(%2,%0,2) , %%mm3 \n\t" |
||||
"packssdw %%mm1 , %%mm0 \n\t" |
||||
"packssdw %%mm3 , %%mm2 \n\t" |
||||
"movq %%mm0 , (%1,%0) \n\t" |
||||
"movq %%mm2 , 8(%1,%0) \n\t" |
||||
"add $16 , %0 \n\t" |
||||
" js 1b \n\t" |
||||
"femms \n\t" |
||||
:"+r"(reglen), "+r"(dst), "+r"(src) |
||||
); |
||||
} |
||||
|
||||
static void float_to_int16_sse(int16_t *dst, const float *src, long len){ |
||||
x86_reg reglen = len; |
||||
__asm__ volatile( |
||||
"add %0 , %0 \n\t" |
||||
"lea (%2,%0,2) , %2 \n\t" |
||||
"add %0 , %1 \n\t" |
||||
"neg %0 \n\t" |
||||
"1: \n\t" |
||||
"cvtps2pi (%2,%0,2) , %%mm0 \n\t" |
||||
"cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" |
||||
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" |
||||
"cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" |
||||
"packssdw %%mm1 , %%mm0 \n\t" |
||||
"packssdw %%mm3 , %%mm2 \n\t" |
||||
"movq %%mm0 , (%1,%0) \n\t" |
||||
"movq %%mm2 , 8(%1,%0) \n\t" |
||||
"add $16 , %0 \n\t" |
||||
" js 1b \n\t" |
||||
"emms \n\t" |
||||
:"+r"(reglen), "+r"(dst), "+r"(src) |
||||
); |
||||
} |
||||
|
||||
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ |
||||
x86_reg reglen = len; |
||||
__asm__ volatile( |
||||
"add %0 , %0 \n\t" |
||||
"lea (%2,%0,2) , %2 \n\t" |
||||
"add %0 , %1 \n\t" |
||||
"neg %0 \n\t" |
||||
"1: \n\t" |
||||
"cvtps2dq (%2,%0,2) , %%xmm0 \n\t" |
||||
"cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" |
||||
"packssdw %%xmm1 , %%xmm0 \n\t" |
||||
"movdqa %%xmm0 , (%1,%0) \n\t" |
||||
"add $16 , %0 \n\t" |
||||
" js 1b \n\t" |
||||
:"+r"(reglen), "+r"(dst), "+r"(src) |
||||
); |
||||
} |
||||
|
||||
/* Six-channel float -> interleaved int16 converters defined outside this
 * file (presumably in the yasm sources, given the HAVE_YASM guard below). */
void ff_float_to_int16_interleave6_sse  (int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2 (int16_t *dst, const float **src, int len);

#if !HAVE_YASM
/* Without yasm, redirect the six-channel entry points to the generic
 * per-channel interleavers with the channel count fixed to 6. */
#define ff_float_to_int16_interleave6_sse(dst, src, len) \
    float_to_int16_interleave_misc_sse(dst, src, len, 6)
#define ff_float_to_int16_interleave6_3dnow(dst, src, len) \
    float_to_int16_interleave_misc_3dnow(dst, src, len, 6)
#define ff_float_to_int16_interleave6_3dn2(dst, src, len) \
    float_to_int16_interleave_misc_3dnow(dst, src, len, 6)
#endif

/* The SSE2 flavour has no six-channel routine of its own; it uses the SSE one. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
||||
|
||||
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ |
||||
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
|
||||
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
||||
DECLARE_ALIGNED(16, int16_t, tmp)[len];\
|
||||
int i,j,c;\
|
||||
for(c=0; c<channels; c++){\
|
||||
float_to_int16_##cpu(tmp, src[c], len);\
|
||||
for(i=0, j=c; i<len; i++, j+=channels)\
|
||||
dst[j] = tmp[i];\
|
||||
}\
|
||||
}\
|
||||
\
|
||||
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
||||
if(channels==1)\
|
||||
float_to_int16_##cpu(dst, src[0], len);\
|
||||
else if(channels==2){\
|
||||
x86_reg reglen = len; \
|
||||
const float *src0 = src[0];\
|
||||
const float *src1 = src[1];\
|
||||
__asm__ volatile(\
|
||||
"shl $2, %0 \n"\
|
||||
"add %0, %1 \n"\
|
||||
"add %0, %2 \n"\
|
||||
"add %0, %3 \n"\
|
||||
"neg %0 \n"\
|
||||
body\
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
|
||||
);\
|
||||
}else if(channels==6){\
|
||||
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
|
||||
}else\
|
||||
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
|
||||
} |
||||
|
||||
FLOAT_TO_INT16_INTERLEAVE(3dnow, |
||||
"1: \n" |
||||
"pf2id (%2,%0), %%mm0 \n" |
||||
"pf2id 8(%2,%0), %%mm1 \n" |
||||
"pf2id (%3,%0), %%mm2 \n" |
||||
"pf2id 8(%3,%0), %%mm3 \n" |
||||
"packssdw %%mm1, %%mm0 \n" |
||||
"packssdw %%mm3, %%mm2 \n" |
||||
"movq %%mm0, %%mm1 \n" |
||||
"punpcklwd %%mm2, %%mm0 \n" |
||||
"punpckhwd %%mm2, %%mm1 \n" |
||||
"movq %%mm0, (%1,%0)\n" |
||||
"movq %%mm1, 8(%1,%0)\n" |
||||
"add $16, %0 \n" |
||||
"js 1b \n" |
||||
"femms \n" |
||||
) |
||||
|
||||
FLOAT_TO_INT16_INTERLEAVE(sse, |
||||
"1: \n" |
||||
"cvtps2pi (%2,%0), %%mm0 \n" |
||||
"cvtps2pi 8(%2,%0), %%mm1 \n" |
||||
"cvtps2pi (%3,%0), %%mm2 \n" |
||||
"cvtps2pi 8(%3,%0), %%mm3 \n" |
||||
"packssdw %%mm1, %%mm0 \n" |
||||
"packssdw %%mm3, %%mm2 \n" |
||||
"movq %%mm0, %%mm1 \n" |
||||
"punpcklwd %%mm2, %%mm0 \n" |
||||
"punpckhwd %%mm2, %%mm1 \n" |
||||
"movq %%mm0, (%1,%0)\n" |
||||
"movq %%mm1, 8(%1,%0)\n" |
||||
"add $16, %0 \n" |
||||
"js 1b \n" |
||||
"emms \n" |
||||
) |
||||
|
||||
FLOAT_TO_INT16_INTERLEAVE(sse2, |
||||
"1: \n" |
||||
"cvtps2dq (%2,%0), %%xmm0 \n" |
||||
"cvtps2dq (%3,%0), %%xmm1 \n" |
||||
"packssdw %%xmm1, %%xmm0 \n" |
||||
"movhlps %%xmm0, %%xmm1 \n" |
||||
"punpcklwd %%xmm1, %%xmm0 \n" |
||||
"movdqa %%xmm0, (%1,%0) \n" |
||||
"add $16, %0 \n" |
||||
"js 1b \n" |
||||
) |
||||
|
||||
/**
 * Interleaving float -> int16 conversion for the extended 3DNow! flavour.
 *
 * Only the six-channel layout has a dedicated routine; every other channel
 * count is forwarded unchanged to the plain 3DNow! dispatcher.
 *
 * @param dst      interleaved int16 output
 * @param src      array of per-channel float inputs
 * @param len      number of samples per channel
 * @param channels number of channels to interleave
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
||||
|
||||
/**
 * Install x86 SIMD implementations into a format-conversion context.
 *
 * Nothing is changed unless the CPU reports MMX. Later assignments override
 * earlier ones, so the effective order of preference is SSE2, SSE,
 * extended 3DNow!, 3DNow!. The 3DNow! routines are skipped when the codec
 * context requests bit-exact output.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context; its flags are consulted for CODEC_FLAG_BITEXACT
 */
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    const int cpu_flags = av_get_cpu_flags();

    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;

    /* avctx is only consulted when a 3DNow! flag is actually set. */
    if ((cpu_flags & AV_CPU_FLAG_3DNOW) &&
        !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16            = float_to_int16_3dnow;
        c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
    }

    if ((cpu_flags & AV_CPU_FLAG_3DNOWEXT) &&
        !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
        c->float_to_int16             = float_to_int16_sse;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE2) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
        c->float_to_int16             = float_to_int16_sse2;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
    }
}
Loading…
Reference in new issue