diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c index fb41918265..6329295c9a 100644 --- a/libavcodec/apedec.c +++ b/libavcodec/apedec.c @@ -25,6 +25,7 @@ #include "libavutil/avassert.h" #include "libavutil/channel_layout.h" #include "libavutil/opt.h" +#include "apedsp.h" #include "avcodec.h" #include "dsputil.h" #include "bytestream.h" @@ -136,6 +137,7 @@ typedef struct APEContext { AVClass *class; ///< class for AVOptions AVCodecContext *avctx; DSPContext dsp; + APEDSPContext adsp; int channels; int samples; ///< samples left to decode in current frame int bps; @@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count); static void predictor_decode_mono_3950(APEContext *ctx, int count); static void predictor_decode_stereo_3950(APEContext *ctx, int count); -// TODO: dsputilize - static av_cold int ape_decode_close(AVCodecContext *avctx) { APEContext *s = avctx->priv_data; @@ -212,6 +212,29 @@ static av_cold int ape_decode_close(AVCodecContext *avctx) return 0; } +/** + * Calculate the scalar product of v1 and v2, and update v1 in place: + * v1[i] += v3[i] * mul for each of the order elements. + * Each product uses the value of v1[i] from before its update. + * + * The sum is accumulated as unsigned so that it wraps around instead of + * causing signed integer overflow, which is undefined behaviour in C. + * + * @return the scalar product of v1 (before the update) and v2 + */ +static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, + const int16_t *v3, + int order, int mul) +{ + unsigned res = 0; + + while (order--) { + res += *v1 * *v2++; + *v1++ += mul * *v3++; + } + return res; +} + static av_cold int ape_decode_init(AVCodecContext *avctx) { APEContext *s = avctx->priv_data; @@ -292,6 +315,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx) s->predictor_decode_stereo = predictor_decode_stereo_3950; } + s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; + + if (ARCH_ARM) + ff_apedsp_init_arm(&s->adsp); + if (ARCH_PPC) + ff_apedsp_init_ppc(&s->adsp); + if (ARCH_X86) + ff_apedsp_init_x86(&s->adsp); + ff_dsputil_init(&s->dsp, avctx); avctx->channel_layout = (avctx->channels==2) ? 
AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO; @@ -1263,9 +1295,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f, while (count--) { /* round fixedpoint scalar product */ - res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, - f->adaptcoeffs - order, - order, APESIGN(*data)); + res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs, + f->delay - order, + f->adaptcoeffs - order, + order, APESIGN(*data)); res = (res + (1 << (fracbits - 1))) >> fracbits; res += *data; *data++ = res; diff --git a/libavcodec/apedsp.h b/libavcodec/apedsp.h new file mode 100644 index 0000000000..64e2749679 --- /dev/null +++ b/libavcodec/apedsp.h @@ -0,0 +1,44 @@ +/* + * Monkey's Audio lossless audio decoder + * Copyright (c) 2007 Benjamin Zores + * based upon libdemac from Dave Chapman. + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_APEDSP_H +#define AVCODEC_APEDSP_H + +#include <stdint.h> + +typedef struct APEDSPContext { + /** + * Calculate scalar product of v1 and v2, + * and v1[i] += v3[i] * mul + * @param len length of vectors, should be multiple of 16 + */ + int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */, + const int16_t *v2, + const int16_t *v3, + int len, int mul); +} APEDSPContext; + +void ff_apedsp_init_arm(APEDSPContext *c); +void ff_apedsp_init_ppc(APEDSPContext *c); +void ff_apedsp_init_x86(APEDSPContext *c); + +#endif /* AVCODEC_APEDSP_H */ diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 9d5b6aab5b..13025af9c1 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ arm/sbrdsp_init_arm.o +OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \ arm/flacdsp_arm.o @@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ arm/sbrdsp_neon.o +NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/apedsp_init_arm.c new file mode 100644 index 0000000000..47ea034359 --- /dev/null +++ b/libavcodec/arm/apedsp_init_arm.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/apedsp.h" + +int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, + const int16_t *v3, int len, int mul); + +av_cold void ff_apedsp_init_arm(APEDSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon; + } +} diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/apedsp_neon.S new file mode 100644 index 0000000000..7cfbf43c6d --- /dev/null +++ b/libavcodec/arm/apedsp_neon.S @@ -0,0 +1,62 @@ +/* + * ARM NEON optimised integer operations + * Copyright (c) 2009 Kostya Shishkov + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul) +function ff_scalarproduct_and_madd_int16_neon, export=1 + vld1.16 {d28[],d29[]}, [sp] + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vmov.i16 q2, #0 + vmov.i16 q3, #0 + mov r12, r0 + +1: vld1.16 {d16-d17}, [r0,:128]! + vld1.16 {d18-d19}, [r1]! + vld1.16 {d20-d21}, [r2]! + vld1.16 {d22-d23}, [r0,:128]! + vld1.16 {d24-d25}, [r1]! + vld1.16 {d26-d27}, [r2]! + vmul.s16 q10, q10, q14 + vmul.s16 q13, q13, q14 + vmlal.s16 q0, d16, d18 + vmlal.s16 q1, d17, d19 + vadd.s16 q10, q8, q10 + vadd.s16 q13, q11, q13 + vmlal.s16 q2, d22, d24 + vmlal.s16 q3, d23, d25 + vst1.16 {q10}, [r12,:128]! + subs r3, r3, #16 + vst1.16 {q13}, [r12,:128]! 
+ bne 1b + + vpadd.s32 d16, d0, d1 + vpadd.s32 d17, d2, d3 + vpadd.s32 d18, d4, d5 + vpadd.s32 d19, d6, d7 + vpadd.s32 d0, d16, d17 + vpadd.s32 d1, d18, d19 + vpadd.s32 d2, d0, d1 + vpaddl.s32 d3, d2 + vmov.32 r0, d3[0] + bx lr +endfunc diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 16e052dddd..c9bdaa5a78 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min, int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len); -int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, - const int16_t *v3, int len, int mul); - av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { @@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx, c->vector_clip_int32 = ff_vector_clip_int32_neon; c->scalarproduct_int16 = ff_scalarproduct_int16_neon; - - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon; } diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S index 3d2faffa48..42f37392e1 100644 --- a/libavcodec/arm/int_neon.S +++ b/libavcodec/arm/int_neon.S @@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1 vmov.32 r0, d3[0] bx lr endfunc - -@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul) -function ff_scalarproduct_and_madd_int16_neon, export=1 - vld1.16 {d28[],d29[]}, [sp] - vmov.i16 q0, #0 - vmov.i16 q1, #0 - vmov.i16 q2, #0 - vmov.i16 q3, #0 - mov r12, r0 - -1: vld1.16 {d16-d17}, [r0,:128]! - vld1.16 {d18-d19}, [r1]! - vld1.16 {d20-d21}, [r2]! - vld1.16 {d22-d23}, [r0,:128]! - vld1.16 {d24-d25}, [r1]! - vld1.16 {d26-d27}, [r2]! 
- vmul.s16 q10, q10, q14 - vmul.s16 q13, q13, q14 - vmlal.s16 q0, d16, d18 - vmlal.s16 q1, d17, d19 - vadd.s16 q10, q8, q10 - vadd.s16 q13, q11, q13 - vmlal.s16 q2, d22, d24 - vmlal.s16 q3, d23, d25 - vst1.16 {q10}, [r12,:128]! - subs r3, r3, #16 - vst1.16 {q13}, [r12,:128]! - bne 1b - - vpadd.s32 d16, d0, d1 - vpadd.s32 d17, d2, d3 - vpadd.s32 d18, d4, d5 - vpadd.s32 d19, d6, d7 - vpadd.s32 d0, d16, d17 - vpadd.s32 d1, d18, d19 - vpadd.s32 d2, d0, d1 - vpaddl.s32 d3, d2 - vmov.32 r0, d3[0] - bx lr -endfunc diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 11447c01e8..6b846588fa 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2, return res; } -static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul) -{ - int res = 0; - - while (order--) { - res += *v1 * *v2++; - *v1++ += mul * *v3++; - } - return res; -} - static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len) { @@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) c->try_8x8basis = try_8x8basis_c; c->add_8x8basis = add_8x8basis_c; - c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; - c->scalarproduct_int16 = scalarproduct_int16_c; c->vector_clip_int32 = vector_clip_int32_c; c->vector_clipf = vector_clipf_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index d261f7e702..471988bddd 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -255,16 +255,6 @@ typedef struct DSPContext { */ int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2 /* align 16 */, int len); - /* ape functions */ - /** - * Calculate scalar product of v1 and v2, - * and v1[i] += v3[i] * mul - * @param len length of vectors, should be multiple of 16 - */ - int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */, - 
const int16_t *v2, - const int16_t *v3, - int len, int mul); /** * Clip each element in an array of int32_t to a given minimum and diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index ec0674c817..b78d4be8ae 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o +OBJS-$(CONFIG_APE_DECODER) += ppc/apedsp_altivec.o OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o diff --git a/libavcodec/ppc/apedsp_altivec.c b/libavcodec/ppc/apedsp_altivec.c new file mode 100644 index 0000000000..de9df45c6c --- /dev/null +++ b/libavcodec/ppc/apedsp_altivec.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2007 Luca Barbato + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#if HAVE_ALTIVEC_H +#include <altivec.h> +#endif + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/types_altivec.h" +#include "libavcodec/apedsp.h" + +#if HAVE_ALTIVEC +/** + * AltiVec version of scalarproduct_and_madd_int16. + * + * Each loop iteration handles 16 elements, and the do/while loop always runs + * at least once, so order is expected to be a non-zero multiple of 16. + * NOTE(review): the vec_perm mask is derived from v2 only and reused for v3; + * presumably both share the same misalignment - confirm against the caller. + */ +static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, + const int16_t *v2, + const int16_t *v3, + int order, int mul) +{ + LOAD_ZERO; + vec_s16 *pv1 = (vec_s16 *) v1; + register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul }; + register vec_s16 t0, t1, i0, i1, i4; + register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3); + register vec_s32 res = zero_s32v; + register vec_u8 align = vec_lvsl(0, v2); + int32_t ires; + + order >>= 4; + do { + i1 = vec_ld(16, v2); + t0 = vec_perm(i2, i1, align); + i2 = vec_ld(32, v2); + t1 = vec_perm(i1, i2, align); + i0 = pv1[0]; + i1 = pv1[1]; + res = vec_msum(t0, i0, res); + res = vec_msum(t1, i1, res); + i4 = vec_ld(16, v3); + t0 = vec_perm(i3, i4, align); + i3 = vec_ld(32, v3); + t1 = vec_perm(i4, i3, align); + pv1[0] = vec_mladd(t0, muls, i0); + pv1[1] = vec_mladd(t1, muls, i1); + pv1 += 2; + v2 += 16; + v3 += 16; + } while (--order); + res = vec_splat(vec_sums(res, zero_s32v), 3); + vec_ste(res, 0, &ires); + + return ires; +} +#endif /* HAVE_ALTIVEC */ + +/** + * Install the AltiVec implementation of scalarproduct_and_madd_int16. + * + * c->scalarproduct_and_madd_int16 is left untouched unless AltiVec support + * was compiled in and av_get_cpu_flags() reports AltiVec at run time. + */ +av_cold void ff_apedsp_init_ppc(APEDSPContext *c) +{ +#if HAVE_ALTIVEC + if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) + return; + + c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec; +#endif /* HAVE_ALTIVEC */ +} diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c index fa3cb66095..d76d34a5b1 100644 --- a/libavcodec/ppc/int_altivec.c +++ b/libavcodec/ppc/int_altivec.c @@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2, return ires; } -static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, - const int16_t 
*v2, - const int16_t *v3, - int order, int mul) -{ - LOAD_ZERO; - vec_s16 *pv1 = (vec_s16 *) v1; - register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul }; - register vec_s16 t0, t1, i0, i1, i4; - register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3); - register vec_s32 res = zero_s32v; - register vec_u8 align = vec_lvsl(0, v2); - int32_t ires; - - order >>= 4; - do { - i1 = vec_ld(16, v2); - t0 = vec_perm(i2, i1, align); - i2 = vec_ld(32, v2); - t1 = vec_perm(i1, i2, align); - i0 = pv1[0]; - i1 = pv1[1]; - res = vec_msum(t0, i0, res); - res = vec_msum(t1, i1, res); - i4 = vec_ld(16, v3); - t0 = vec_perm(i3, i4, align); - i3 = vec_ld(32, v3); - t1 = vec_perm(i4, i3, align); - pv1[0] = vec_mladd(t0, muls, i0); - pv1[1] = vec_mladd(t1, muls, i1); - pv1 += 2; - v2 += 16; - v3 += 16; - } while (--order); - res = vec_splat(vec_sums(res, zero_s32v), 3); - vec_ste(res, 0, &ires); - - return ires; -} - av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx) { c->scalarproduct_int16 = scalarproduct_int16_altivec; - - c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec; } diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 8830a22a8f..10242269c2 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o +OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o @@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o +YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o diff 
--git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm new file mode 100644 index 0000000000..d721ebda6b --- /dev/null +++ b/libavcodec/x86/apedsp.asm @@ -0,0 +1,167 @@ +;****************************************************************************** +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +%macro SCALARPRODUCT 0 +; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, +; int order, int mul) +cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul + shl orderq, 1 + movd m7, mulm +%if mmsize == 16 + pshuflw m7, m7, 0 + punpcklqdq m7, m7 +%else + pshufw m7, m7, 0 +%endif + pxor m6, m6 + add v1q, orderq + add v2q, orderq + add v3q, orderq + neg orderq +.loop: + movu m0, [v2q + orderq] + movu m1, [v2q + orderq + mmsize] + mova m4, [v1q + orderq] + mova m5, [v1q + orderq + mmsize] + movu m2, [v3q + orderq] + movu m3, [v3q + orderq + mmsize] + pmaddwd m0, m4 + pmaddwd m1, m5 + pmullw m2, m7 + pmullw m3, m7 + paddd m6, m0 + paddd m6, m1 + paddw m2, m4 + paddw m3, m5 + mova [v1q + orderq], m2 + mova [v1q + orderq + mmsize], m3 + add orderq, 
mmsize*2 + jl .loop +%if mmsize == 16 + movhlps m0, m6 + paddd m6, m0 + pshuflw m0, m6, 0x4e +%else + pshufw m0, m6, 0x4e +%endif + paddd m6, m0 + movd eax, m6 + RET +%endmacro + +INIT_MMX mmxext +SCALARPRODUCT +INIT_XMM sse2 +SCALARPRODUCT + +%macro SCALARPRODUCT_LOOP 1 +align 16 +.loop%1: + sub orderq, mmsize*2 +%if %1 + mova m1, m4 + mova m4, [v2q + orderq] + mova m0, [v2q + orderq + mmsize] + palignr m1, m0, %1 + palignr m0, m4, %1 + mova m3, m5 + mova m5, [v3q + orderq] + mova m2, [v3q + orderq + mmsize] + palignr m3, m2, %1 + palignr m2, m5, %1 +%else + mova m0, [v2q + orderq] + mova m1, [v2q + orderq + mmsize] + mova m2, [v3q + orderq] + mova m3, [v3q + orderq + mmsize] +%endif + %define t0 [v1q + orderq] + %define t1 [v1q + orderq + mmsize] +%if ARCH_X86_64 + mova m8, t0 + mova m9, t1 + %define t0 m8 + %define t1 m9 +%endif + pmaddwd m0, t0 + pmaddwd m1, t1 + pmullw m2, m7 + pmullw m3, m7 + paddw m2, t0 + paddw m3, t1 + paddd m6, m0 + paddd m6, m1 + mova [v1q + orderq], m2 + mova [v1q + orderq + mmsize], m3 + jg .loop%1 +%if %1 + jmp .end +%endif +%endmacro + +; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, +; int order, int mul) +INIT_XMM ssse3 +cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul + shl orderq, 1 + movd m7, mulm + pshuflw m7, m7, 0 + punpcklqdq m7, m7 + pxor m6, m6 + mov r4d, v2d + and r4d, 15 + and v2q, ~15 + and v3q, ~15 + mova m4, [v2q + orderq] + mova m5, [v3q + orderq] + ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. 
predictable) + cmp r4d, 0 + je .loop0 + cmp r4d, 2 + je .loop2 + cmp r4d, 4 + je .loop4 + cmp r4d, 6 + je .loop6 + cmp r4d, 8 + je .loop8 + cmp r4d, 10 + je .loop10 + cmp r4d, 12 + je .loop12 +SCALARPRODUCT_LOOP 14 +SCALARPRODUCT_LOOP 12 +SCALARPRODUCT_LOOP 10 +SCALARPRODUCT_LOOP 8 +SCALARPRODUCT_LOOP 6 +SCALARPRODUCT_LOOP 4 +SCALARPRODUCT_LOOP 2 +SCALARPRODUCT_LOOP 0 +.end: + movhlps m0, m6 + paddd m6, m0 + pshuflw m0, m6, 0x4e + paddd m6, m0 + movd eax, m6 + RET diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c new file mode 100644 index 0000000000..f692c2b9b6 --- /dev/null +++ b/libavcodec/x86/apedsp_init.c @@ -0,0 +1,47 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/apedsp.h" + +int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, + const int16_t *v3, + int order, int mul); +int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, + const int16_t *v3, + int order, int mul); +int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, + const int16_t *v3, + int order, int mul); + +av_cold void ff_apedsp_init_x86(APEDSPContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMXEXT(cpu_flags)) + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; + + if (EXTERNAL_SSE2(cpu_flags)) + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; + + if (EXTERNAL_SSSE3(cpu_flags) && + !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; +} diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 684f09b7fc..b5d6d3cc65 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order paddd m2, m0 movd eax, m2 RET - -; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, -; int order, int mul) -cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm -%if mmsize == 16 - pshuflw m7, m7, 0 - punpcklqdq m7, m7 -%else - pshufw m7, m7, 0 -%endif - pxor m6, m6 - add v1q, orderq - add v2q, orderq - add v3q, orderq - neg orderq -.loop: - movu m0, [v2q + orderq] - movu m1, [v2q + orderq + mmsize] - mova m4, [v1q + orderq] - mova m5, [v1q + orderq + 
mmsize] - movu m2, [v3q + orderq] - movu m3, [v3q + orderq + mmsize] - pmaddwd m0, m4 - pmaddwd m1, m5 - pmullw m2, m7 - pmullw m3, m7 - paddd m6, m0 - paddd m6, m1 - paddw m2, m4 - paddw m3, m5 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - add orderq, mmsize*2 - jl .loop -%if mmsize == 16 - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e -%else - pshufw m0, m6, 0x4e -%endif - paddd m6, m0 - movd eax, m6 - RET %endmacro INIT_MMX mmxext @@ -106,97 +60,6 @@ SCALARPRODUCT INIT_XMM sse2 SCALARPRODUCT -%macro SCALARPRODUCT_LOOP 1 -align 16 -.loop%1: - sub orderq, mmsize*2 -%if %1 - mova m1, m4 - mova m4, [v2q + orderq] - mova m0, [v2q + orderq + mmsize] - palignr m1, m0, %1 - palignr m0, m4, %1 - mova m3, m5 - mova m5, [v3q + orderq] - mova m2, [v3q + orderq + mmsize] - palignr m3, m2, %1 - palignr m2, m5, %1 -%else - mova m0, [v2q + orderq] - mova m1, [v2q + orderq + mmsize] - mova m2, [v3q + orderq] - mova m3, [v3q + orderq + mmsize] -%endif - %define t0 [v1q + orderq] - %define t1 [v1q + orderq + mmsize] -%if ARCH_X86_64 - mova m8, t0 - mova m9, t1 - %define t0 m8 - %define t1 m9 -%endif - pmaddwd m0, t0 - pmaddwd m1, t1 - pmullw m2, m7 - pmullw m3, m7 - paddw m2, t0 - paddw m3, t1 - paddd m6, m0 - paddd m6, m1 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - jg .loop%1 -%if %1 - jmp .end -%endif -%endmacro - -; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, -; int order, int mul) -INIT_XMM ssse3 -cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm - pshuflw m7, m7, 0 - punpcklqdq m7, m7 - pxor m6, m6 - mov r4d, v2d - and r4d, 15 - and v2q, ~15 - and v3q, ~15 - mova m4, [v2q + orderq] - mova m5, [v3q + orderq] - ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. 
predictable) - cmp r4d, 0 - je .loop0 - cmp r4d, 2 - je .loop2 - cmp r4d, 4 - je .loop4 - cmp r4d, 6 - je .loop6 - cmp r4d, 8 - je .loop8 - cmp r4d, 10 - je .loop10 - cmp r4d, 12 - je .loop12 -SCALARPRODUCT_LOOP 14 -SCALARPRODUCT_LOOP 12 -SCALARPRODUCT_LOOP 10 -SCALARPRODUCT_LOOP 8 -SCALARPRODUCT_LOOP 6 -SCALARPRODUCT_LOOP 4 -SCALARPRODUCT_LOOP 2 -SCALARPRODUCT_LOOP 0 -.end: - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e - paddd m6, m0 - movd eax, m6 - RET - ;----------------------------------------------------------------------------- ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c index 10fa166db4..9b0788ff73 100644 --- a/libavcodec/x86/dsputil_init.c +++ b/libavcodec/x86/dsputil_init.c @@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2, int order); int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order); -int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); -int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, - const int16_t *v3, - int order, int mul); void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); @@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; #endif /* HAVE_MMXEXT_EXTERNAL */ } @@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, #if HAVE_SSE2_EXTERNAL c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; - 
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; if (cpu_flags & AV_CPU_FLAG_ATOM) { c->vector_clip_int32 = ff_vector_clip_int32_int_sse2; } else { @@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, int cpu_flags, unsigned high_bit_depth) { #if HAVE_SSSE3_EXTERNAL - if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; c->bswap_buf = ff_bswap32_buf_ssse3; #endif /* HAVE_SSSE3_EXTERNAL */ }