From d56668bd80075615b89aff652fe8a576bf853ceb Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sun, 20 Jan 2013 15:41:52 -0800
Subject: [PATCH 1/2] floatdsp: move scalarproduct_float from dsputil to
 avfloatdsp.

This makes the aac decoder and all voice codecs independent of dsputil.
---
 libavcodec/aac.h                    |  1 -
 libavcodec/aacdec.c                 |  3 +--
 libavcodec/acelp_pitch_delay.c      |  4 ++--
 libavcodec/acelp_vectors.c          |  6 +++---
 libavcodec/amrnbdec.c               | 20 ++++++++---------
 libavcodec/amrwbdec.c               | 33 +++++++++++++++--------------
 libavcodec/arm/dsputil_init_neon.c  |  3 ---
 libavcodec/arm/dsputil_neon.S       | 13 ------------
 libavcodec/dsputil.c                | 12 -----------
 libavcodec/dsputil.h                | 18 ----------------
 libavcodec/qcelpdec.c               | 17 +++++++--------
 libavcodec/ra288.c                  |  4 ++--
 libavcodec/sipr.c                   | 15 +++++++------
 libavcodec/sipr16k.c                |  8 +++----
 libavcodec/wmavoice.c               | 16 ++++++++------
 libavcodec/x86/dsputil.asm          | 26 -----------------------
 libavcodec/x86/dsputil_mmx.c        |  6 ------
 libavutil/arm/float_dsp_init_neon.c |  3 +++
 libavutil/arm/float_dsp_neon.S      | 13 ++++++++++++
 libavutil/float_dsp.c               | 12 +++++++++++
 libavutil/float_dsp.h               | 22 +++++++++++++++++++
 libavutil/x86/float_dsp.asm         | 25 ++++++++++++++++++++++
 libavutil/x86/float_dsp_init.c      |  3 +++
 23 files changed, 142 insertions(+), 141 deletions(-)

diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index 6c5d962dd8..dd337a0a75 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -291,7 +291,6 @@ typedef struct AACContext {
     FFTContext mdct;
     FFTContext mdct_small;
     FFTContext mdct_ltp;
-    DSPContext dsp;
     FmtConvertContext fmt_conv;
     AVFloatDSPContext fdsp;
     int random_state;
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c
index b016611fcf..5afc9b820e 100644
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -895,7 +895,6 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
 
     ff_aac_sbr_init();
 
-    ff_dsputil_init(&ac->dsp, avctx);
     ff_fmt_convert_init(&ac->fmt_conv, avctx);
     avpriv_float_dsp_init(&ac->fdsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
@@ -1358,7 +1357,7 @@ static int decode_spectrum_and_dequant(AACContext *ac, float coef[1024],
                         cfo[k] = ac->random_state;
                     }
 
-                    band_energy = ac->dsp.scalarproduct_float(cfo, cfo, off_len);
+                    band_energy = ac->fdsp.scalarproduct_float(cfo, cfo, off_len);
                     scale = sf[idx] / sqrtf(band_energy);
                     ac->fdsp.vector_fmul_scalar(cfo, cfo, scale, off_len);
                 }
diff --git a/libavcodec/acelp_pitch_delay.c b/libavcodec/acelp_pitch_delay.c
index a9668fac70..ab09bdb6c5 100644
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -21,9 +21,9 @@
  */
 
 #include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "acelp_pitch_delay.h"
 #include "celp_math.h"
 
@@ -120,7 +120,7 @@ float ff_amr_set_fixed_gain(float fixed_gain_factor, float fixed_mean_energy,
     // Note 10^(0.05 * -10log(average x2)) = 1/sqrt((average x2)).
     float val = fixed_gain_factor *
         exp2f(M_LOG2_10 * 0.05 *
-              (ff_scalarproduct_float_c(pred_table, prediction_error, 4) +
+              (avpriv_scalarproduct_float_c(pred_table, prediction_error, 4) +
                energy_mean)) /
         sqrtf(fixed_mean_energy);
 
diff --git a/libavcodec/acelp_vectors.c b/libavcodec/acelp_vectors.c
index b50c5f3ffe..a85e45f4c7 100644
--- a/libavcodec/acelp_vectors.c
+++ b/libavcodec/acelp_vectors.c
@@ -23,8 +23,8 @@
 #include <inttypes.h>
 
 #include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "acelp_vectors.h"
 
 const uint8_t ff_fc_2pulses_9bits_track1[16] =
@@ -183,7 +183,7 @@ void ff_adaptive_gain_control(float *out, const float *in, float speech_energ,
                               int size, float alpha, float *gain_mem)
 {
     int i;
-    float postfilter_energ = ff_scalarproduct_float_c(in, in, size);
+    float postfilter_energ = avpriv_scalarproduct_float_c(in, in, size);
     float gain_scale_factor = 1.0;
     float mem = *gain_mem;
 
@@ -204,7 +204,7 @@ void ff_scale_vector_to_given_sum_of_squares(float *out, const float *in,
                                              float sum_of_squares, const int n)
 {
     int i;
-    float scalefactor = ff_scalarproduct_float_c(in, in, n);
+    float scalefactor = avpriv_scalarproduct_float_c(in, in, n);
     if (scalefactor)
         scalefactor = sqrt(sum_of_squares / scalefactor);
     for (i = 0; i < n; i++)
diff --git a/libavcodec/amrnbdec.c b/libavcodec/amrnbdec.c
index 5c359a8f3d..7db12dd001 100644
--- a/libavcodec/amrnbdec.c
+++ b/libavcodec/amrnbdec.c
@@ -44,8 +44,8 @@
 #include <math.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/float_dsp.h"
 #include "avcodec.h"
-#include "dsputil.h"
 #include "libavutil/common.h"
 #include "celp_filters.h"
 #include "acelp_filters.h"
@@ -794,8 +794,8 @@ static int synthesis(AMRContext *p, float *lpc,
 
     // emphasize pitch vector contribution
     if (p->pitch_gain[4] > 0.5 && !overflow) {
-        float energy = ff_scalarproduct_float_c(excitation, excitation,
-                                                AMR_SUBFRAME_SIZE);
+        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+                                                    AMR_SUBFRAME_SIZE);
         float pitch_factor =
             p->pitch_gain[4] *
             (p->cur_frame_mode == MODE_12k2 ?
@@ -871,8 +871,8 @@ static float tilt_factor(float *lpc_n, float *lpc_d)
     ff_celp_lp_synthesis_filterf(hf, lpc_d, hf, AMR_TILT_RESPONSE,
                                  LP_FILTER_ORDER);
 
-    rh0 = ff_scalarproduct_float_c(hf, hf,     AMR_TILT_RESPONSE);
-    rh1 = ff_scalarproduct_float_c(hf, hf + 1, AMR_TILT_RESPONSE - 1);
+    rh0 = avpriv_scalarproduct_float_c(hf, hf,     AMR_TILT_RESPONSE);
+    rh1 = avpriv_scalarproduct_float_c(hf, hf + 1, AMR_TILT_RESPONSE - 1);
 
     // The spec only specifies this check for 12.2 and 10.2 kbit/s
     // modes. But in the ref source the tilt is always non-negative.
@@ -892,8 +892,8 @@ static void postfilter(AMRContext *p, float *lpc, float *buf_out)
     int i;
     float *samples          = p->samples_in + LP_FILTER_ORDER; // Start of input
 
-    float speech_gain       = ff_scalarproduct_float_c(samples, samples,
-                                                       AMR_SUBFRAME_SIZE);
+    float speech_gain       = avpriv_scalarproduct_float_c(samples, samples,
+                                                           AMR_SUBFRAME_SIZE);
 
     float pole_out[AMR_SUBFRAME_SIZE + LP_FILTER_ORDER];  // Output of pole filter
     const float *gamma_n, *gamma_d;                       // Formant filter factor table
@@ -998,9 +998,9 @@ static int amrnb_decode_frame(AVCodecContext *avctx, void *data,
 
         p->fixed_gain[4] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  ff_scalarproduct_float_c(p->fixed_vector,
-                                                           p->fixed_vector,
-                                                           AMR_SUBFRAME_SIZE) /
+                                  avpriv_scalarproduct_float_c(p->fixed_vector,
+                                                               p->fixed_vector,
+                                                               AMR_SUBFRAME_SIZE) /
                                   AMR_SUBFRAME_SIZE,
                        p->prediction_error,
                        energy_mean[p->cur_frame_mode], energy_pred_fac);
diff --git a/libavcodec/amrwbdec.c b/libavcodec/amrwbdec.c
index 01d95f68df..553ec3dfa2 100644
--- a/libavcodec/amrwbdec.c
+++ b/libavcodec/amrwbdec.c
@@ -26,10 +26,10 @@
 
 #include "libavutil/channel_layout.h"
 #include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
 #include "libavutil/lfg.h"
 
 #include "avcodec.h"
-#include "dsputil.h"
 #include "lsp.h"
 #include "celp_filters.h"
 #include "acelp_filters.h"
@@ -595,11 +595,11 @@ static void pitch_sharpening(AMRWBContext *ctx, float *fixed_vector)
 static float voice_factor(float *p_vector, float p_gain,
                           float *f_vector, float f_gain)
 {
-    double p_ener = (double) ff_scalarproduct_float_c(p_vector, p_vector,
-                                                      AMRWB_SFR_SIZE) *
+    double p_ener = (double) avpriv_scalarproduct_float_c(p_vector, p_vector,
+                                                          AMRWB_SFR_SIZE) *
                     p_gain * p_gain;
-    double f_ener = (double) ff_scalarproduct_float_c(f_vector, f_vector,
-                                                      AMRWB_SFR_SIZE) *
+    double f_ener = (double) avpriv_scalarproduct_float_c(f_vector, f_vector,
+                                                          AMRWB_SFR_SIZE) *
                     f_gain * f_gain;
 
     return (p_ener - f_ener) / (p_ener + f_ener);
@@ -768,8 +768,8 @@ static void synthesis(AMRWBContext *ctx, float *lpc, float *excitation,
     /* emphasize pitch vector contribution in low bitrate modes */
     if (ctx->pitch_gain[0] > 0.5 && ctx->fr_cur_mode <= MODE_8k85) {
         int i;
-        float energy = ff_scalarproduct_float_c(excitation, excitation,
-                                                AMRWB_SFR_SIZE);
+        float energy = avpriv_scalarproduct_float_c(excitation, excitation,
+                                                    AMRWB_SFR_SIZE);
 
         // XXX: Weird part in both ref code and spec. A unknown parameter
         // {beta} seems to be identical to the current pitch gain
@@ -828,9 +828,9 @@ static void upsample_5_4(float *out, const float *in, int o_size)
         i++;
 
         for (k = 1; k < 5; k++) {
-            out[i] = ff_scalarproduct_float_c(in0 + int_part,
-                                              upsample_fir[4 - frac_part],
-                                              UPS_MEM_SIZE);
+            out[i] = avpriv_scalarproduct_float_c(in0 + int_part,
+                                                  upsample_fir[4 - frac_part],
+                                                  UPS_MEM_SIZE);
             int_part++;
             frac_part--;
             i++;
@@ -856,8 +856,8 @@ static float find_hb_gain(AMRWBContext *ctx, const float *synth,
     if (ctx->fr_cur_mode == MODE_23k85)
         return qua_hb_gain[hb_idx] * (1.0f / (1 << 14));
 
-    tilt = ff_scalarproduct_float_c(synth, synth + 1, AMRWB_SFR_SIZE - 1) /
-           ff_scalarproduct_float_c(synth, synth, AMRWB_SFR_SIZE);
+    tilt = avpriv_scalarproduct_float_c(synth, synth + 1, AMRWB_SFR_SIZE - 1) /
+           avpriv_scalarproduct_float_c(synth, synth, AMRWB_SFR_SIZE);
 
     /* return gain bounded by [0.1, 1.0] */
     return av_clipf((1.0 - FFMAX(0.0, tilt)) * (1.25 - 0.25 * wsp), 0.1, 1.0);
@@ -876,7 +876,8 @@ static void scaled_hb_excitation(AMRWBContext *ctx, float *hb_exc,
                                  const float *synth_exc, float hb_gain)
 {
     int i;
-    float energy = ff_scalarproduct_float_c(synth_exc, synth_exc, AMRWB_SFR_SIZE);
+    float energy = avpriv_scalarproduct_float_c(synth_exc, synth_exc,
+                                                AMRWB_SFR_SIZE);
 
     /* Generate a white-noise excitation */
     for (i = 0; i < AMRWB_SFR_SIZE_16k; i++)
@@ -1168,9 +1169,9 @@ static int amrwb_decode_frame(AVCodecContext *avctx, void *data,
 
         ctx->fixed_gain[0] =
             ff_amr_set_fixed_gain(fixed_gain_factor,
-                                  ff_scalarproduct_float_c(ctx->fixed_vector,
-                                                           ctx->fixed_vector,
-                                                           AMRWB_SFR_SIZE) /
+                                  avpriv_scalarproduct_float_c(ctx->fixed_vector,
+                                                               ctx->fixed_vector,
+                                                               AMRWB_SFR_SIZE) /
                                   AMRWB_SFR_SIZE,
                        ctx->prediction_error,
                        ENERGY_MEAN, energy_pred_fac);
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 0e42158f19..f27aee4fb1 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -142,8 +142,6 @@ void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
 
-float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-
 void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
                           int len);
 void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
@@ -293,7 +291,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
         c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
     }
 
-    c->scalarproduct_float        = ff_scalarproduct_float_neon;
     c->vector_clipf               = ff_vector_clipf_neon;
     c->vector_clip_int32          = ff_vector_clip_int32_neon;
 
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index a9b3a3d8b3..cf92817ba6 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -531,19 +531,6 @@ function ff_add_pixels_clamped_neon, export=1
         bx              lr
 endfunc
 
-function ff_scalarproduct_float_neon, export=1
-        vmov.f32        q2,  #0.0
-1:      vld1.32         {q0},[r0,:128]!
-        vld1.32         {q1},[r1,:128]!
-        vmla.f32        q2,  q0,  q1
-        subs            r2,  r2,  #4
-        bgt             1b
-        vadd.f32        d0,  d4,  d5
-        vpadd.f32       d0,  d0,  d0
-NOVFP   vmov.32         r0,  d0[0]
-        bx              lr
-endfunc
-
 function ff_vector_clipf_neon, export=1
 VFP     vdup.32         q1,  d0[1]
 VFP     vdup.32         q0,  d0[0]
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 8ce741a308..caf1b071d7 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2353,17 +2353,6 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
 
-float ff_scalarproduct_float_c(const float *v1, const float *v2, int len)
-{
-    float p = 0.0;
-    int i;
-
-    for (i = 0; i < len; i++)
-        p += v1[i] * v2[i];
-
-    return p;
-}
-
 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                    uint32_t maxi, uint32_t maxisign)
 {
@@ -2694,7 +2683,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->apply_window_int16 = apply_window_int16_c;
     c->vector_clip_int32 = vector_clip_int32_c;
-    c->scalarproduct_float = ff_scalarproduct_float_c;
 
     c->shrink[0]= av_image_copy_plane;
     c->shrink[1]= ff_shrink22;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 57afcdaaa8..9b88058345 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -342,13 +342,6 @@ typedef struct DSPContext {
 
     /* assume len is a multiple of 8, and arrays are 16-byte aligned */
     void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
-    /**
-     * Calculate the scalar product of two vectors of floats.
-     * @param v1  first vector, 16-byte aligned
-     * @param v2  second vector, 16-byte aligned
-     * @param len length of vectors, multiple of 4
-     */
-    float (*scalarproduct_float)(const float *v1, const float *v2, int len);
 
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
@@ -454,17 +447,6 @@ void ff_dsputil_init(DSPContext* p, AVCodecContext *avctx);
 
 int ff_check_alignment(void);
 
-/**
- * Return the scalar product of two vectors.
- *
- * @param v1  first input vector
- * @param v2  first input vector
- * @param len number of elements
- *
- * @return sum of elementwise products
- */
-float ff_scalarproduct_float_c(const float *v1, const float *v2, int len);
-
 /**
  * permute block according to permuatation.
  * @param last last non zero element in scantable order
diff --git a/libavcodec/qcelpdec.c b/libavcodec/qcelpdec.c
index b702175c19..59220d53e3 100644
--- a/libavcodec/qcelpdec.c
+++ b/libavcodec/qcelpdec.c
@@ -30,10 +30,10 @@
 #include <stddef.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
-#include "dsputil.h"
 #include "qcelpdata.h"
 #include "celp_filters.h"
 #include "acelp_filters.h"
@@ -400,12 +400,10 @@ static void apply_gain_ctrl(float *v_out, const float *v_ref, const float *v_in)
 {
     int i;
 
-    for (i = 0; i < 160; i += 40)
-        ff_scale_vector_to_given_sum_of_squares(v_out + i, v_in + i,
-                                                ff_scalarproduct_float_c(v_ref + i,
-                                                                         v_ref + i,
-                                                                         40),
-                                                40);
+    for (i = 0; i < 160; i += 40) {
+        float res = avpriv_scalarproduct_float_c(v_ref + i, v_ref + i, 40);
+        ff_scale_vector_to_given_sum_of_squares(v_out + i, v_in + i, res, 40);
+    }
 }
 
 /**
@@ -680,8 +678,9 @@ static void postfilter(QCELPContext *q, float *samples, float *lpc)
     ff_tilt_compensation(&q->postfilter_tilt_mem, 0.3, pole_out + 10, 160);
 
     ff_adaptive_gain_control(samples, pole_out + 10,
-                             ff_scalarproduct_float_c(q->formant_mem + 10,
-                                                      q->formant_mem + 10, 160),
+                             avpriv_scalarproduct_float_c(q->formant_mem + 10,
+                                                          q->formant_mem + 10,
+                                                          160),
                              160, 0.9375, &q->postfilter_agc_mem);
 }
 
diff --git a/libavcodec/ra288.c b/libavcodec/ra288.c
index 8266673aec..319bdd4e22 100644
--- a/libavcodec/ra288.c
+++ b/libavcodec/ra288.c
@@ -79,7 +79,7 @@ static av_cold int ra288_decode_init(AVCodecContext *avctx)
 static void convolve(float *tgt, const float *src, int len, int n)
 {
     for (; n >= 0; n--)
-        tgt[n] = ff_scalarproduct_float_c(src, src - n, len);
+        tgt[n] = avpriv_scalarproduct_float_c(src, src - n, len);
 
 }
 
@@ -108,7 +108,7 @@ static void decode(RA288Context *ractx, float gain, int cb_coef)
     for (i=0; i < 5; i++)
         buffer[i] = codetable[cb_coef][i] * sumsum;
 
-    sum = ff_scalarproduct_float_c(buffer, buffer, 5) * ((1 << 24) / 5.);
+    sum = avpriv_scalarproduct_float_c(buffer, buffer, 5) * ((1 << 24) / 5.);
 
     sum = FFMAX(sum, 1);
 
diff --git a/libavcodec/sipr.c b/libavcodec/sipr.c
index d482b0f068..3f3c13c6e1 100644
--- a/libavcodec/sipr.c
+++ b/libavcodec/sipr.c
@@ -26,11 +26,11 @@
 #include <string.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
 #include "avcodec.h"
 #define BITSTREAM_READER_LE
 #include "get_bits.h"
-#include "dsputil.h"
 #include "internal.h"
 
 #include "lsp.h"
@@ -411,9 +411,10 @@ static void decode_frame(SiprContext *ctx, SiprParameters *params,
         convolute_with_sparse(fixed_vector, &fixed_cb, impulse_response,
                               SUBFR_SIZE);
 
-        avg_energy =
-            (0.01 + ff_scalarproduct_float_c(fixed_vector, fixed_vector, SUBFR_SIZE)) /
-                SUBFR_SIZE;
+        avg_energy = (0.01 + avpriv_scalarproduct_float_c(fixed_vector,
+                                                          fixed_vector,
+                                                          SUBFR_SIZE)) /
+                     SUBFR_SIZE;
 
         ctx->past_pitch_gain = pitch_gain = gain_cb[params->gc_index[i]][0];
 
@@ -454,9 +455,9 @@ static void decode_frame(SiprContext *ctx, SiprParameters *params,
 
     if (ctx->mode == MODE_5k0) {
         for (i = 0; i < subframe_count; i++) {
-            float energy = ff_scalarproduct_float_c(ctx->postfilter_syn5k0 + LP_FILTER_ORDER + i * SUBFR_SIZE,
-                                                    ctx->postfilter_syn5k0 + LP_FILTER_ORDER + i * SUBFR_SIZE,
-                                                    SUBFR_SIZE);
+            float energy = avpriv_scalarproduct_float_c(ctx->postfilter_syn5k0 + LP_FILTER_ORDER + i * SUBFR_SIZE,
+                                                        ctx->postfilter_syn5k0 + LP_FILTER_ORDER + i * SUBFR_SIZE,
+                                                        SUBFR_SIZE);
             ff_adaptive_gain_control(&synth[i * SUBFR_SIZE],
                                      &synth[i * SUBFR_SIZE], energy,
                                      SUBFR_SIZE, 0.9, &ctx->postfilter_agc);
diff --git a/libavcodec/sipr16k.c b/libavcodec/sipr16k.c
index bff739e44f..a472dfd59a 100644
--- a/libavcodec/sipr16k.c
+++ b/libavcodec/sipr16k.c
@@ -25,8 +25,8 @@
 
 #include "sipr.h"
 #include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
 #include "libavutil/mathematics.h"
-#include "dsputil.h"
 #include "lsp.h"
 #include "celp_filters.h"
 #include "acelp_vectors.h"
@@ -163,11 +163,11 @@ static float acelp_decode_gain_codef(float gain_corr_factor, const float *fc_v,
                                      const float *ma_prediction_coeff,
                                      int subframe_size, int ma_pred_order)
 {
-    mr_energy +=
-        ff_scalarproduct_float_c(quant_energy, ma_prediction_coeff, ma_pred_order);
+    mr_energy += avpriv_scalarproduct_float_c(quant_energy, ma_prediction_coeff,
+                                              ma_pred_order);
 
     mr_energy = gain_corr_factor * exp(M_LN10 / 20. * mr_energy) /
-        sqrt((0.01 + ff_scalarproduct_float_c(fc_v, fc_v, subframe_size)));
+        sqrt((0.01 + avpriv_scalarproduct_float_c(fc_v, fc_v, subframe_size)));
     return mr_energy;
 }
 
diff --git a/libavcodec/wmavoice.c b/libavcodec/wmavoice.c
index 08d0600200..ba778cda31 100644
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -30,8 +30,8 @@
 #include <math.h>
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/float_dsp.h"
 #include "libavutil/mem.h"
-#include "dsputil.h"
 #include "avcodec.h"
 #include "internal.h"
 #include "get_bits.h"
@@ -523,7 +523,7 @@ static int kalman_smoothen(WMAVoiceContext *s, int pitch,
 
     /* find best fitting point in history */
     do {
-        dot = ff_scalarproduct_float_c(in, ptr, size);
+        dot = avpriv_scalarproduct_float_c(in, ptr, size);
         if (dot > optimal_gain) {
             optimal_gain  = dot;
             best_hist_ptr = ptr;
@@ -532,7 +532,7 @@ static int kalman_smoothen(WMAVoiceContext *s, int pitch,
 
     if (optimal_gain <= 0)
         return -1;
-    dot = ff_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
+    dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
     if (dot <= 0) // would be 1.0
         return -1;
 
@@ -562,8 +562,8 @@ static float tilt_factor(const float *lpcs, int n_lpcs)
 {
     float rh0, rh1;
 
-    rh0 = 1.0     + ff_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
-    rh1 = lpcs[0] + ff_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
+    rh0 = 1.0     + avpriv_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
+    rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
 
     return rh1 / rh0;
 }
@@ -656,7 +656,8 @@ static void calc_input_response(WMAVoiceContext *s, float *lpcs,
                              -1.8 * tilt_factor(coeffs, remainder - 1),
                              coeffs, remainder);
     }
-    sq = (1.0 / 64.0) * sqrtf(1 / ff_scalarproduct_float_c(coeffs, coeffs, remainder));
+    sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
+                                                               remainder));
     for (n = 0; n < remainder; n++)
         coeffs[n] *= sq;
 }
@@ -1320,7 +1321,8 @@ static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
     /* Calculate gain for adaptive & fixed codebook signal.
      * see ff_amr_set_fixed_gain(). */
     idx = get_bits(gb, 7);
-    fcb_gain = expf(ff_scalarproduct_float_c(s->gain_pred_err, gain_coeff, 6) -
+    fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
+                                                 gain_coeff, 6) -
                     5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
     acb_gain = wmavoice_gain_codebook_acb[idx];
     pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 27e77d565d..65f4b37d8f 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -463,32 +463,6 @@ cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
 
-
-; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
-INIT_XMM sse
-cglobal scalarproduct_float, 3,3,2, v1, v2, offset
-    neg offsetq
-    shl offsetq, 2
-    sub v1q, offsetq
-    sub v2q, offsetq
-    xorps xmm0, xmm0
-    .loop:
-        movaps   xmm1, [v1q+offsetq]
-        mulps    xmm1, [v2q+offsetq]
-        addps    xmm0, xmm1
-        add      offsetq, 16
-        js       .loop
-    movhlps xmm1, xmm0
-    addps   xmm0, xmm1
-    movss   xmm1, xmm0
-    shufps  xmm0, xmm0, 1
-    addss   xmm0, xmm1
-%if ARCH_X86_64 == 0
-    movss   r0m,  xmm0
-    fld     dword r0m
-%endif
-    RET
-
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
 ;                           int32_t max, unsigned int len)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 503764817a..65247c0016 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1846,8 +1846,6 @@ int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
 
-float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
-
 void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
@@ -2128,10 +2126,6 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
 
     c->vector_clipf = vector_clipf_sse;
 #endif /* HAVE_INLINE_ASM */
-
-#if HAVE_YASM
-    c->scalarproduct_float          = ff_scalarproduct_float_sse;
-#endif /* HAVE_YASM */
 }
 
 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c
index b3644e82a2..a7245ad92b 100644
--- a/libavutil/arm/float_dsp_init_neon.c
+++ b/libavutil/arm/float_dsp_init_neon.c
@@ -43,6 +43,8 @@ void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
 
 void ff_butterflies_float_neon(float *v1, float *v2, int len);
 
+float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
+
 void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
 {
     fdsp->vector_fmul = ff_vector_fmul_neon;
@@ -52,4 +54,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
     fdsp->vector_fmul_add    = ff_vector_fmul_add_neon;
     fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
     fdsp->butterflies_float = ff_butterflies_float_neon;
+    fdsp->scalarproduct_float = ff_scalarproduct_float_neon;
 }
diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S
index 4acc406d33..559b565628 100644
--- a/libavutil/arm/float_dsp_neon.S
+++ b/libavutil/arm/float_dsp_neon.S
@@ -256,3 +256,16 @@ function ff_butterflies_float_neon, export=1
         bgt             1b
         bx              lr
 endfunc
+
+function ff_scalarproduct_float_neon, export=1
+        vmov.f32        q2,  #0.0               @ q2 = four partial sums, all zero
+1:      vld1.32         {q0},[r0,:128]!         @ 4 floats from r0 (128-bit aligned), r0 += 16
+        vld1.32         {q1},[r1,:128]!         @ 4 floats from r1 (128-bit aligned), r1 += 16
+        vmla.f32        q2,  q0,  q1            @ q2 += q0 * q1, lane by lane
+        subs            r2,  r2,  #4            @ 4 elements consumed, flags updated
+        bgt             1b                      @ loop while r2 > 0; body runs at least once
+        vadd.f32        d0,  d4,  d5            @ d4/d5 are the two halves of q2: 4 sums -> 2
+        vpadd.f32       d0,  d0,  d0            @ pairwise add: 2 sums -> 1
+NOVFP   vmov.32         r0,  d0[0]              @ result to r0; NOVFP presumably = non-VFP-args builds only, confirm in asm.S
+        bx              lr                      @ return to caller
+endfunc
diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
index c6e2b41d66..a40b029a29 100644
--- a/libavutil/float_dsp.c
+++ b/libavutil/float_dsp.c
@@ -101,6 +101,17 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2,
     }
 }
 
+float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len)
+{
+    float p = 0.0; /* float accumulator for the sum of products */
+    int i;
+
+    for (i = 0; i < len; i++) /* scalar loop: any len, any alignment */
+        p += v1[i] * v2[i];
+
+    return p; /* stays 0.0 if len <= 0 */
+}
+
 void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact)
 {
     fdsp->vector_fmul = vector_fmul_c;
@@ -111,6 +122,7 @@ void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact)
     fdsp->vector_fmul_add = vector_fmul_add_c;
     fdsp->vector_fmul_reverse = vector_fmul_reverse_c;
     fdsp->butterflies_float = butterflies_float_c;
+    fdsp->scalarproduct_float = avpriv_scalarproduct_float_c;
 
 #if ARCH_ARM
     ff_float_dsp_init_arm(fdsp);
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index ec57b36f79..f2b90a4848 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -146,8 +146,30 @@ typedef struct AVFloatDSPContext {
      * @param len length of vectors, multiple of 4
      */
     void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
+
+    /**
+     * Calculate the scalar product of two vectors of floats.
+     *
+     * @param v1  first vector, 16-byte aligned
+     * @param v2  second vector, 16-byte aligned
+     * @param len length of vectors, multiple of 4
+     *
+     * @return sum of elementwise products
+     */
+    float (*scalarproduct_float)(const float *v1, const float *v2, int len);
 } AVFloatDSPContext;
 
+/**
+ * Return the scalar product of two vectors.
+ *
+ * @param v1  first input vector
+ * @param v2  second input vector
+ * @param len number of elements
+ *
+ * @return sum of elementwise products
+ */
+float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len);
+
 /**
  * Initialize a float DSP context.
  *
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 126f3495c4..779339c575 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -227,3 +227,28 @@ INIT_XMM sse
 VECTOR_FMUL_REVERSE
 INIT_YMM avx
 VECTOR_FMUL_REVERSE
+
+; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
+INIT_XMM sse
+cglobal scalarproduct_float, 3,3,2, v1, v2, offset
+    shl   offsetd, 2                ; len * 4 bytes; 32-bit op zero-extends, drops junk in bits 63:32
+    add       v1q, offsetq          ; v1 -> one past the last element
+    add       v2q, offsetq          ; v2 -> one past the last element
+    neg   offsetq                   ; offset = -len * 4, counts up towards zero
+    xorps    xmm0, xmm0             ; four partial sums = 0
+.loop:
+    movaps   xmm1, [v1q+offsetq]    ; aligned load of 4 floats from v1
+    mulps    xmm1, [v2q+offsetq]    ; times 4 floats from v2 (memory operand must be aligned)
+    addps    xmm0, xmm1             ; accumulate lane by lane
+    add   offsetq, 16               ; advance 4 floats
+    js .loop                        ; repeat while offset is still negative
+    movhlps  xmm1, xmm0             ; upper two sums -> low half of xmm1
+    addps    xmm0, xmm1             ; 4 sums -> 2
+    movss    xmm1, xmm0             ; keep lane 0
+    shufps   xmm0, xmm0, 1          ; bring lane 1 down to lane 0
+    addss    xmm0, xmm1             ; 2 sums -> 1, result in the low lane of xmm0
+%if ARCH_X86_64 == 0
+    movss     r0m,  xmm0            ; 32-bit: bounce through the first argument slot
+    fld dword r0m                   ; so the float is returned on the x87 stack
+%endif
+    RET
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 9f63b4c057..81c9a7d468 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -51,6 +51,8 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                 const float *src1, int len);
 
+float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len);
+
 #if HAVE_6REGS && HAVE_INLINE_ASM
 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                         const float *src1, const float *win,
@@ -135,6 +137,7 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
         fdsp->vector_fmul_add    = ff_vector_fmul_add_sse;
         fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
+        fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
     }
     if (EXTERNAL_SSE2(mm_flags)) {
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;

From 73b704ac609d83e0be124589f24efd9b94947cf9 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Mon, 21 Jan 2013 10:16:02 +0100
Subject: [PATCH 2/2] arm: Add some missing header #includes

---
 libavcodec/arm/h264pred_init_arm.c | 1 +
 libavcodec/arm/vp3dsp_init_arm.c   | 1 +
 libavcodec/arm/vp8dsp_init_arm.c   | 1 +
 libavcodec/arm/vp8dsp_init_armv6.c | 2 ++
 libavcodec/arm/vp8dsp_init_neon.c  | 2 ++
 5 files changed, 7 insertions(+)

diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c
index 39c012127d..0431fc8691 100644
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@@ -21,6 +21,7 @@
 #include <stdint.h>
 
 #include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
 #include "libavcodec/h264pred.h"
 
 void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index ea99bfd2b3..e9f3fd3f17 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -21,6 +21,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
+#include "libavcodec/dsputil.h"
 #include "libavcodec/vp3dsp.h"
 
 void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index 603f68cd24..b7897cd9fd 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -18,6 +18,7 @@
 
 #include <stdint.h>
 
+#include "libavutil/attributes.h"
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/vp8dsp.h"
 #include "vp8dsp.h"
diff --git a/libavcodec/arm/vp8dsp_init_armv6.c b/libavcodec/arm/vp8dsp_init_armv6.c
index 85a803af83..c3d024f5cb 100644
--- a/libavcodec/arm/vp8dsp_init_armv6.c
+++ b/libavcodec/arm/vp8dsp_init_armv6.c
@@ -17,6 +17,8 @@
  */
 
 #include <stdint.h>
+
+#include "libavutil/attributes.h"
 #include "libavcodec/vp8dsp.h"
 #include "vp8dsp.h"
 
diff --git a/libavcodec/arm/vp8dsp_init_neon.c b/libavcodec/arm/vp8dsp_init_neon.c
index dbe5b9f961..965243c3e4 100644
--- a/libavcodec/arm/vp8dsp_init_neon.c
+++ b/libavcodec/arm/vp8dsp_init_neon.c
@@ -17,6 +17,8 @@
  */
 
 #include <stdint.h>
+
+#include "libavutil/attributes.h"
 #include "libavcodec/vp8dsp.h"
 #include "vp8dsp.h"