From af94ab7c7c004786084903bcf82b7617e88e3aa9 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Fri, 21 Jan 2022 07:50:53 +0100
Subject: [PATCH] lavu/tx: add an RDFT implementation

RDFTs are full of conventions that vary between implementations.
What I've gone for here is what's most common between
both fftw, avcodec's rdft and what we use, the equivalent of
which is DFT_R2C for forward and IDFT_C2R for inverse. The
other 2 conventions (IDFT_R2C and DFT_C2R) were not used at
all in our code, and their names are also not appropriate.
If there's a use for either, we can easily add a flag which
would just flip the sign on one exptab.

For some unknown reason, possibly to allow reusing FFT's exp tables,
av_rdft's C2R output is 0.5x lower than what it should be to ensure
a proper back-and-forth conversion.
This code outputs its real samples at the correct level, which
matches FFTW's level, and allows the user to change the level
and insert arbitrary multiplies for free by setting the scale option.
---
 libavutil/tx.c          |   5 +-
 libavutil/tx.h          |  20 +++++++
 libavutil/tx_template.c | 130 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 154 insertions(+), 1 deletion(-)

diff --git a/libavutil/tx.c b/libavutil/tx.c
index c34d3d94fe..37785b33df 100644
--- a/libavutil/tx.c
+++ b/libavutil/tx.c
@@ -262,7 +262,7 @@ static av_cold int ff_tx_null_init(AVTXContext *s, const FFTXCodelet *cd,
                                    int len, int inv, const void *scale)
 {
     /* Can only handle one sample+type to one sample+type transforms */
-    if (TYPE_IS(MDCT, s->type))
+    if (TYPE_IS(MDCT, s->type) || TYPE_IS(RDFT, s->type))
         return AVERROR(EINVAL);
     return 0;
 }
@@ -322,10 +322,13 @@ static void print_type(AVBPrint *bp, enum AVTXType type)
                type == TX_TYPE_ANY       ? "any"         :
                type == AV_TX_FLOAT_FFT   ? "fft_float"   :
                type == AV_TX_FLOAT_MDCT  ? "mdct_float"  :
+               type == AV_TX_FLOAT_RDFT  ? "rdft_float"  :
                type == AV_TX_DOUBLE_FFT  ? "fft_double"  :
                type == AV_TX_DOUBLE_MDCT ? "mdct_double" :
+               type == AV_TX_DOUBLE_RDFT ? "rdft_double" :
                type == AV_TX_INT32_FFT   ? "fft_int32"   :
                type == AV_TX_INT32_MDCT  ? "mdct_int32"  :
+               type == AV_TX_INT32_RDFT  ? "rdft_int32"  :
                "unknown");
 }
 
diff --git a/libavutil/tx.h b/libavutil/tx.h
index 0e57874376..087355f10d 100644
--- a/libavutil/tx.h
+++ b/libavutil/tx.h
@@ -69,6 +69,26 @@ enum AVTXType {
     AV_TX_DOUBLE_MDCT = 3,
     AV_TX_INT32_MDCT  = 5,
 
+    /**
+     * Real to complex and complex to real DFTs.
+     * For the float and int32 variants, the scale type is 'float', while for
+     * the double variant, it's a 'double'. If scale is NULL, 1.0 will be used
+     * as a default.
+     *
+     * The stride parameter must be set to the size of a single sample in bytes.
+     *
+     * The forward transform performs a real-to-complex DFT of N samples to
+     * N/2+1 complex values.
+     *
+     * The inverse transform performs a complex-to-real DFT of N/2+1 complex
+     * values to N real samples. The output is not normalized, but can be
+     * made so by setting the scale value to 1.0/len.
+     * NOTE: the inverse transform always overwrites the input.
+     */
+    AV_TX_FLOAT_RDFT  = 6,
+    AV_TX_DOUBLE_RDFT = 7,
+    AV_TX_INT32_RDFT  = 8,
+
     /* Not part of the API, do not use */
     AV_TX_NB,
 };
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index cee8458970..1e4354580b 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -1277,6 +1277,134 @@ DECL_COMP_MDCT(7)
 DECL_COMP_MDCT(9)
 DECL_COMP_MDCT(15)
 
+static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
+                                            const FFTXCodelet *cd,
+                                            uint64_t flags,
+                                            FFTXCodeletOptions *opts,
+                                            int len, int inv,
+                                            const void *scale)
+{
+    int ret;
+    double f, m;
+    TXSample *tab;
+
+    s->scale_d = *((SCALE_TYPE *)scale);
+    s->scale_f = s->scale_d;
+
+    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
+        return ret;
+
+    if (!(s->exp = av_mallocz((8 + (len >> 2) - 1)*sizeof(*s->exp))))
+        return AVERROR(ENOMEM);
+
+    tab = (TXSample *)s->exp;
+
+    f = 2*M_PI/len;
+
+    m = (inv ? 2*s->scale_d : s->scale_d);
+
+    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
+    *tab++ = RESCALE(inv ? 0.5*m : 1.0);
+    *tab++ = RESCALE( m);
+    *tab++ = RESCALE(-m);
+
+    *tab++ = RESCALE( (0.5 - 0.0) * m);
+    *tab++ = RESCALE( (0.0 - 0.5) * m);
+    *tab++ = RESCALE( (0.5 - inv) * m);
+    *tab++ = RESCALE(-(0.5 - inv) * m);
+
+    for (int i = 0; i < len >> 2; i++)
+        *tab++ = RESCALE(cos(i*f));
+    for (int i = len >> 2; i >= 0; i--)
+        *tab++ = RESCALE(cos(i*f) * (inv ? +1.0 : -1.0));
+
+    return 0;
+}
+
+#define DECL_RDFT(name, inv)                                                   \
+static void TX_NAME(ff_tx_rdft_ ##name)(AVTXContext *s, void *_dst,            \
+                                       void *_src, ptrdiff_t stride)           \
+{                                                                              \
+    const int len2 = s->len >> 1;                                              \
+    const int len4 = s->len >> 2;                                              \
+    const TXSample *fact = (void *)s->exp;                                     \
+    const TXSample *tcos = fact + 8;                                           \
+    const TXSample *tsin = tcos + len4;                                        \
+    TXComplex *data = inv ? _src : _dst;                                       \
+    TXComplex t[3];                                                            \
+                                                                               \
+    if (!inv)                                                                  \
+        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
+    else                                                                       \
+        data[0].im = data[len2].re;                                            \
+                                                                               \
+    /* The DC value's both components are real, but we need to change them     \
+     * into complex values. Also, the middle of the array is special-cased.    \
+     * These operations can be done before or after the loop. */               \
+    t[0].re = data[0].re;                                                      \
+    data[0].re = t[0].re + data[0].im;                                         \
+    data[0].im = t[0].re - data[0].im;                                         \
+    data[   0].re = MULT(fact[0], data[   0].re);                              \
+    data[   0].im = MULT(fact[1], data[   0].im);                              \
+    data[len4].re = MULT(fact[2], data[len4].re);                              \
+    data[len4].im = MULT(fact[3], data[len4].im);                              \
+                                                                               \
+    for (int i = 1; i < len4; i++) {                                           \
+        /* Separate even and odd FFTs */                                       \
+        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
+        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
+        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
+        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
+                                                                               \
+        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
+        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
+                                                                               \
+        data[       i].re = t[0].re + t[2].re;                                 \
+        data[       i].im = t[2].im - t[0].im;                                 \
+        data[len2 - i].re = t[0].re - t[2].re;                                 \
+        data[len2 - i].im = t[2].im + t[0].im;                                 \
+    }                                                                          \
+                                                                               \
+    if (inv) {                                                                 \
+        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
+    } else {                                                                   \
+        /* Move [0].im to the last position, as convention requires */         \
+        data[len2].re = data[0].im;                                            \
+        data[   0].im = 0;                                                     \
+    }                                                                          \
+}
+
+DECL_RDFT(r2c, 0)
+DECL_RDFT(c2r, 1)
+
+static const FFTXCodelet TX_NAME(ff_tx_rdft_r2c_def) = {
+    .name       = TX_NAME_STR("rdft_r2c"),
+    .function   = TX_NAME(ff_tx_rdft_r2c),
+    .type       = TX_TYPE(RDFT),
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
+                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
+    .factors    = { 2, TX_FACTOR_ANY },
+    .min_len    = 2,
+    .max_len    = TX_LEN_UNLIMITED,
+    .init       = TX_NAME(ff_tx_rdft_init),
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
+    .prio       = FF_TX_PRIO_BASE,
+};
+
+static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
+    .name       = TX_NAME_STR("rdft_c2r"),
+    .function   = TX_NAME(ff_tx_rdft_c2r),
+    .type       = TX_TYPE(RDFT),
+    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
+                  FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
+    .factors    = { 2, TX_FACTOR_ANY },
+    .min_len    = 2,
+    .max_len    = TX_LEN_UNLIMITED,
+    .init       = TX_NAME(ff_tx_rdft_init),
+    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
+    .prio       = FF_TX_PRIO_BASE,
+};
+
 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s)
 {
     int len4 = s->len >> 1;
@@ -1340,6 +1468,8 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
     &TX_NAME(ff_tx_mdct_naive_fwd_def),
     &TX_NAME(ff_tx_mdct_naive_inv_def),
     &TX_NAME(ff_tx_mdct_inv_full_def),
+    &TX_NAME(ff_tx_rdft_r2c_def),
+    &TX_NAME(ff_tx_rdft_c2r_def),
 
     NULL,
 };