diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 398326c8c8..65db20d2b3 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -154,8 +154,6 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
                                 const float *src1, const float *win, int len);
 void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
                                 int len);
-void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
-                                int len);
 void ff_butterflies_float_neon(float *v1, float *v2, int len);
 float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
 void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
@@ -329,7 +327,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 
     c->vector_fmul_window         = ff_vector_fmul_window_neon;
     c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
-    c->vector_fmac_scalar         = ff_vector_fmac_scalar_neon;
     c->butterflies_float          = ff_butterflies_float_neon;
     c->scalarproduct_float        = ff_scalarproduct_float_neon;
     c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 9a5a40d6ac..358ed61299 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -682,54 +682,6 @@ NOVFP   vdup.32         q8,  r2
         .unreq          len
 endfunc
 
-function ff_vector_fmac_scalar_neon, export=1
-VFP     len .req r2
-VFP     acc .req r3
-NOVFP   len .req r3
-NOVFP   acc .req r2
-VFP     vdup.32         q15, d0[0]
-NOVFP   vdup.32         q15, r2
-        bics            r12, len, #15
-        mov             acc, r0
-        beq             3f
-        vld1.32         {q0},     [r1,:128]!
-        vld1.32         {q8},     [acc,:128]!
-        vld1.32         {q1},     [r1,:128]!
-        vld1.32         {q9},     [acc,:128]!
-1:      vmla.f32        q8,  q0,  q15
-        vld1.32         {q2},     [r1,:128]!
-        vld1.32         {q10},    [acc,:128]!
-        vmla.f32        q9,  q1,  q15
-        vld1.32         {q3},     [r1,:128]!
-        vld1.32         {q11},    [acc,:128]!
-        vmla.f32        q10, q2,  q15
-        vst1.32         {q8},     [r0,:128]!
-        vmla.f32        q11, q3,  q15
-        vst1.32         {q9},     [r0,:128]!
-        subs            r12, r12, #16
-        beq             2f
-        vld1.32         {q0},     [r1,:128]!
-        vld1.32         {q8},     [acc,:128]!
-        vst1.32         {q10},    [r0,:128]!
-        vld1.32         {q1},     [r1,:128]!
-        vld1.32         {q9},     [acc,:128]!
-        vst1.32         {q11},    [r0,:128]!
-        b               1b
-2:      vst1.32         {q10},    [r0,:128]!
-        vst1.32         {q11},    [r0,:128]!
-        ands            len, len, #15
-        it              eq
-        bxeq            lr
-3:      vld1.32         {q0},     [r1,:128]!
-        vld1.32         {q8},     [acc,:128]!
-        vmla.f32        q8,  q0,  q15
-        vst1.32         {q8},     [r0,:128]!
-        subs            len, len, #4
-        bgt             3b
-        bx              lr
-        .unreq          len
-endfunc
-
 function ff_butterflies_float_neon, export=1
 1:      vld1.32         {q0},[r0,:128]
         vld1.32         {q1},[r1,:128]
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index 103f0588e3..b37dc49d3f 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -27,6 +27,7 @@
 #include <stdio.h>
 
 #include "libavutil/common.h"
+#include "libavutil/float_dsp.h"
 #include "libavutil/intmath.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
@@ -383,7 +384,7 @@ typedef struct {
     int profile;
 
     int debug_flag;             ///< used for suppressing repeated error messages output
-    DSPContext dsp;
+    AVFloatDSPContext fdsp;
     FFTContext imdct;
     SynthFilterContext synth;
     DCADSPContext dcadsp;
@@ -1865,8 +1866,8 @@ static int dca_decode_frame(AVCodecContext *avctx, void *data,
             float *back_chan = s->samples + s->channel_order_tab[s->xch_base_channel]     * 256;
             float *lt_chan   = s->samples + s->channel_order_tab[s->xch_base_channel - 2] * 256;
             float *rt_chan   = s->samples + s->channel_order_tab[s->xch_base_channel - 1] * 256;
-            s->dsp.vector_fmac_scalar(lt_chan, back_chan, -M_SQRT1_2, 256);
-            s->dsp.vector_fmac_scalar(rt_chan, back_chan, -M_SQRT1_2, 256);
+            s->fdsp.vector_fmac_scalar(lt_chan, back_chan, -M_SQRT1_2, 256);
+            s->fdsp.vector_fmac_scalar(rt_chan, back_chan, -M_SQRT1_2, 256);
         }
 
         if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT) {
@@ -1908,7 +1909,7 @@ static av_cold int dca_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     dca_init_vlcs();
 
-    ff_dsputil_init(&s->dsp, avctx);
+    avpriv_float_dsp_init(&s->fdsp, avctx->flags & CODEC_FLAG_BITEXACT);
     ff_mdct_init(&s->imdct, 6, 1, 1.0);
     ff_synth_filter_init(&s->synth);
     ff_dcadsp_init(&s->dcadsp);
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 942f606ea8..15f184e406 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2401,14 +2401,6 @@ static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
         dst[i] = src[i] * mul;
 }
 
-static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
-                                 int len)
-{
-    int i;
-    for (i = 0; i < len; i++)
-        dst[i] += src[i] * mul;
-}
-
 static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                 int len)
 {
@@ -2904,7 +2896,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->butterflies_float = butterflies_float_c;
     c->butterflies_float_interleave = butterflies_float_interleave_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
-    c->vector_fmac_scalar = vector_fmac_scalar_c;
 
     c->shrink[0]= av_image_copy_plane;
     c->shrink[1]= ff_shrink22;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index ec3d7ee007..e54ae69831 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -416,17 +416,6 @@ typedef struct DSPContext {
      */
     void (*vector_fmul_scalar)(float *dst, const float *src, float mul,
                                int len);
-    /**
-     * Multiply a vector of floats by a scalar float and add to
-     * destination vector.  Source and destination vectors must
-     * overlap exactly or not at all.
-     * @param dst result vector, 16-byte aligned
-     * @param src input vector, 16-byte aligned
-     * @param mul scalar value
-     * @param len length of vector, multiple of 4
-     */
-    void (*vector_fmac_scalar)(float *dst, const float *src, float mul,
-                               int len);
     /**
      * Calculate the scalar product of two vectors of floats.
      * @param v1  first vector, 16-byte aligned
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c
index fa6d0d7d15..3ca0288b31 100644
--- a/libavutil/arm/float_dsp_init_neon.c
+++ b/libavutil/arm/float_dsp_init_neon.c
@@ -26,7 +26,11 @@
 
 void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
 
+void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
+                                int len); /* defined in libavutil/arm/float_dsp_neon.S */
+
 void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
 {
     fdsp->vector_fmul = ff_vector_fmul_neon;
+    fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon; /* install the NEON multiply-accumulate routine */
 }
diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S
index d66fa09424..03b164388f 100644
--- a/libavutil/arm/float_dsp_neon.S
+++ b/libavutil/arm/float_dsp_neon.S
@@ -62,3 +62,51 @@ function ff_vector_fmul_neon, export=1
 3:      vst1.32         {d16-d19},[r0,:128]!
         bx              lr
 endfunc
+
+function ff_vector_fmac_scalar_neon, export=1 /* dst[i] += src[i] * mul; r0 = dst, r1 = src */
+VFP     len .req r2 /* VFP lines: mul is read from d0[0], len from r2 */
+VFP     acc .req r3
+NOVFP   len .req r3 /* NOVFP lines: mul is read from r2, len from r3 */
+NOVFP   acc .req r2 /* r2 is reused as acc only after mul has been copied to q15 */
+VFP     vdup.32         q15, d0[0] /* q15 = mul replicated in every 32-bit lane */
+NOVFP   vdup.32         q15, r2
+        bics            r12, len, #15 /* r12 = len with the low four bits cleared; Z set when that is 0 */
+        mov             acc, r0 /* acc = read pointer into dst, r0 stays the write pointer; flags are not changed */
+        beq             3f /* no full group of 16 floats: go straight to the 4-float loop */
+        vld1.32         {q0},     [r1,:128]! /* preload two src and two dst vectors for the first pass */
+        vld1.32         {q8},     [acc,:128]! /* the :128 qualifier requires 16-byte aligned addresses */
+        vld1.32         {q1},     [r1,:128]!
+        vld1.32         {q9},     [acc,:128]!
+1:      vmla.f32        q8,  q0,  q15 /* main loop: 16 floats per pass, loads and stores interleaved with the vmla */
+        vld1.32         {q2},     [r1,:128]!
+        vld1.32         {q10},    [acc,:128]!
+        vmla.f32        q9,  q1,  q15
+        vld1.32         {q3},     [r1,:128]!
+        vld1.32         {q11},    [acc,:128]!
+        vmla.f32        q10, q2,  q15
+        vst1.32         {q8},     [r0,:128]!
+        vmla.f32        q11, q3,  q15
+        vst1.32         {q9},     [r0,:128]!
+        subs            r12, r12, #16 /* one group of 16 floats computed */
+        beq             2f /* last group: skip the preload and just flush q10/q11 */
+        vld1.32         {q0},     [r1,:128]! /* preload the next pass while q10/q11 are being stored */
+        vld1.32         {q8},     [acc,:128]!
+        vst1.32         {q10},    [r0,:128]!
+        vld1.32         {q1},     [r1,:128]!
+        vld1.32         {q9},     [acc,:128]!
+        vst1.32         {q11},    [r0,:128]!
+        b               1b
+2:      vst1.32         {q10},    [r0,:128]! /* flush the last two results of the main loop */
+        vst1.32         {q11},    [r0,:128]!
+        ands            len, len, #15 /* floats left over after the 16-wide loop */
+        it              eq
+        bxeq            lr /* return when nothing is left */
+3:      vld1.32         {q0},     [r1,:128]! /* tail loop: 4 floats per pass */
+        vld1.32         {q8},     [acc,:128]!
+        vmla.f32        q8,  q0,  q15
+        vst1.32         {q8},     [r0,:128]!
+        subs            len, len, #4
+        bgt             3b /* NOTE(review): body runs before this test, so len == 0 on entry still processes 4 floats */
+        bx              lr
+        .unreq          len /* NOTE(review): the acc alias is never released with .unreq */
+endfunc
diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
index 039dd07d36..2e90939090 100644
--- a/libavutil/float_dsp.c
+++ b/libavutil/float_dsp.c
@@ -28,9 +28,18 @@ static void vector_fmul_c(float *dst, const float *src0, const float *src1,
         dst[i] = src0[i] * src1[i];
 }
 
+static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
+                                 int len) /* C version: dst[i] += src[i] * mul for i in [0, len) */
+{
+    int i;
+    for (i = 0; i < len; i++) /* no iterations when len <= 0 */
+        dst[i] += src[i] * mul;
+}
+
 void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact)
 {
     fdsp->vector_fmul = vector_fmul_c;
+    fdsp->vector_fmac_scalar = vector_fmac_scalar_c;
 
 #if ARCH_ARM
     ff_float_dsp_init_arm(fdsp);
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index 30161a252b..4e266304da 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -35,6 +35,22 @@ typedef struct AVFloatDSPContext {
      */
     void (*vector_fmul)(float *dst, const float *src0, const float *src1,
                         int len);
+
+    /**
+     * Multiply a vector of floats by a scalar float and add to
+     * destination vector.  Source and destination vectors must
+     * overlap exactly or not at all.
+     *
+     * @param dst result vector
+     *            constraints: 16-byte aligned
+     * @param src input vector
+     *            constraints: 16-byte aligned
+     * @param mul scalar value
+     * @param len length of vector
+     *            constraints: multiple of 4
+     */
+    void (*vector_fmac_scalar)(float *dst, const float *src, float mul,
+                               int len);
 } AVFloatDSPContext;
 
 /**