lavu/tx: refactor to explicitly track and convert lookup table order

Necessary for generalizing PFAs.
pull/388/head
Lynne 2 years ago
parent 1c8d77a2bf
commit 87bae6b018
No known key found for this signature in database
GPG Key ID: A2FEA5F03F034464
  1. 5
      libavutil/aarch64/tx_float_init.c
  2. 109
      libavutil/tx.c
  3. 52
      libavutil/tx_priv.h
  4. 49
      libavutil/tx_template.c
  5. 46
      libavutil/x86/tx_float_init.c

@ -37,12 +37,11 @@ static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd,
uint64_t flags, FFTXCodeletOptions *opts,
int len, int inv, const void *scale)
{
const int inv_lookup = opts ? opts->invert_lookup : 1;
ff_tx_init_tabs_float(len);
if (cd->max_len == 2)
return ff_tx_gen_ptwo_revtab(s, inv_lookup);
return ff_tx_gen_ptwo_revtab(s, opts);
else
return ff_tx_gen_split_radix_parity_revtab(s, len, inv, inv_lookup, 8, 0);
return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0);
}
const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = {

@ -39,11 +39,41 @@ static av_always_inline int mulinv(int n, int m)
return 0;
}
/* Builds a Ruritanian PFA input map for a d1 x d2 decomposition into s->map.
 *
 * The table holds s->len entries and is filled in consecutive blocks of
 * d1*d2 entries; the values stored in each block are block-relative
 * (0 .. d1*d2 - 1).
 *
 * Returns 0 on success, AVERROR(ENOMEM) if the table cannot be allocated. */
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts,
                            int d1, int d2)
{
    const int sl = d1*d2;
    const int scatter_requested = opts && opts->map_dir == FF_TX_MAP_SCATTER;
    /* Inverse transforms always use the permutation-as-index layout */
    const int index_by_perm = s->inv || scatter_requested;

    s->map = av_malloc(s->len*sizeof(*s->map));
    if (!s->map)
        return AVERROR(ENOMEM);

    for (int base = 0; base < s->len; base += sl) {
        int *blk = s->map + base;

        for (int row = 0; row < d2; row++) {
            for (int col = 0; col < d1; col++) {
                const int linear = row*d1 + col;
                const int perm   = (row*d1 + col*d2) % (sl);

                if (index_by_perm)
                    blk[perm] = linear;
                else
                    blk[linear] = perm;
            }
        }

        /* Mirror every entry except the first one of the block */
        if (s->inv) {
            for (int w = 1; w <= ((sl) >> 1); w++)
                FFSWAP(int, blk[w], blk[sl - w]);
        }
    }

    /* Record the direction that was asked for; GATHER when no options given */
    if (opts)
        s->map_dir = opts->map_dir;
    else
        s->map_dir = FF_TX_MAP_GATHER;

    return 0;
}
/* Guaranteed to work for any n, m where gcd(n, m) == 1 */
int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts,
int inv, int n, int m)
{
int *in_map, *out_map;
const int inv = s->inv;
const int len = n*m; /* Will not be equal to s->len for MDCTs */
int m_inv, n_inv;
@ -61,14 +91,22 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
out_map = s->map + len;
/* Ruritanian map for input, CRT map for output, can be swapped */
for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
in_map[j*n + i] = (i*m + j*n) % len;
out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
in_map[(i*m + j*n) % len] = j*n + i;
out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
}
}
} else {
for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
in_map[j*n + i] = (i*m + j*n) % len;
out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
}
}
}
/* Change transform direction by reversing all ACs */
if (inv) {
for (int i = 0; i < m; i++) {
int *in = &in_map[i*n + 1]; /* Skip the DC */
@ -77,17 +115,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m)
}
}
/* Our 15-point transform is also a compound one, so embed its input map */
if (n == 15) {
for (int k = 0; k < m; k++) {
int tmp[15];
memcpy(tmp, &in_map[k*15], 15*sizeof(*tmp));
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 3; j++)
in_map[k*15 + i*3 + j] = tmp[(i*3 + j*5) % 15];
}
}
}
s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
return 0;
}
@ -103,21 +131,23 @@ static inline int split_radix_permutation(int i, int len, int inv)
return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv);
}
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup)
int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
{
int len = s->len;
if (!(s->map = av_malloc(len*sizeof(*s->map))))
return AVERROR(ENOMEM);
if (invert_lookup) {
if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
for (int i = 0; i < s->len; i++)
s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
} else {
for (int i = 0; i < s->len; i++)
s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
}
s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
return 0;
}
@ -207,7 +237,8 @@ static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
}
int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
int inv_lookup, int basis, int dual_stride)
FFTXCodeletOptions *opts,
int basis, int dual_stride)
{
basis >>= 1;
if (len < basis)
@ -220,7 +251,10 @@ int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
av_assert0(dual_stride <= basis);
parity_revtab_generator(s->map, len, inv, 0, 0, 0, len,
basis, dual_stride, inv_lookup != 0);
basis, dual_stride,
opts ? opts->map_dir == FF_TX_MAP_GATHER : FF_TX_MAP_GATHER);
s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
return 0;
}
@ -656,6 +690,33 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
ret = cd->init(sctx, cd, flags, opts, len, inv, scale);
if (ret >= 0) {
    /* The codelet initialized successfully. Reconcile the lookup table it
     * left in sctx with the direction requested through opts, if any. */
    if (opts && opts->map_dir != FF_TX_MAP_NONE &&
        sctx->map_dir == FF_TX_MAP_NONE) {
        /* A specific map direction was requested, but the codelet reported
         * none: create an identity map. */
        sctx->map = av_malloc(len*sizeof(*sctx->map));
        if (!sctx->map) {
            ret = AVERROR(ENOMEM);
            goto end;
        }

        for (int i = 0; i < len; i++)
            sctx->map[i] = i;
    } else if (opts && (opts->map_dir != sctx->map_dir)) {
        /* The codelet produced a map in a different direction than the one
         * requested: invert the permutation in place, using a scratch
         * copy of the original table as the source. */
        int *tmp = av_malloc(len*sizeof(*sctx->map));
        if (!tmp) {
            ret = AVERROR(ENOMEM);
            goto end;
        }

        memcpy(tmp, sctx->map, len*sizeof(*sctx->map));

        for (int i = 0; i < len; i++)
            sctx->map[tmp[i]] = i;

        /* tmp was obtained from av_malloc(), so it has to be released with
         * av_free() and not with the C library's free(). */
        av_free(tmp);
    }

    s->nb_sub++;
    goto end;
}

@ -158,10 +158,23 @@ typedef enum FFTXCodeletPriority {
FF_TX_PRIO_MAX = 32768, /* For custom implementations/ASICs */
} FFTXCodeletPriority;
/* Describes how a lookup table (lut) has to be applied to move data from
 * src to dst. */
typedef enum FFTXMapDirection {
    /* No map direction is set. This is the zero value. */
    FF_TX_MAP_NONE    = 0,

    /* The table indexes the source: dst[i] = src[lut[i]]; */
    FF_TX_MAP_GATHER  = 1,

    /* The table indexes the destination: dst[lut[i]] = src[i]; */
    FF_TX_MAP_SCATTER = 2,
} FFTXMapDirection;
/* Codelet options */
typedef struct FFTXCodeletOptions {
int invert_lookup; /* If codelet is flagged as FF_TX_CODELET_PRESHUFFLE,
invert the lookup direction for the map generated */
/* Request a specific lookup table direction. Codelets MUST put the
* direction in AVTXContext. If the codelet does not respect this, a
* conversion will be performed. */
FFTXMapDirection map_dir;
} FFTXCodeletOptions;
/* Maximum number of factors a codelet may have. Arbitrary. */
@ -234,11 +247,32 @@ struct AVTXContext {
enum AVTXType type; /* Type of transform */
uint64_t flags; /* A combination of AVTXFlags and
* codelet flags used when creating */
FFTXMapDirection map_dir; /* Direction of AVTXContext->map */
float scale_f;
double scale_d;
void *opaque; /* Free to use by implementations */
};
/* This macro embeds a Ruritanian PFA input map into an existing lookup table
 * to avoid double permutation. This allows for compound factors to be
 * synthesized as fast PFA FFTs and embedded into either other or standalone
 * transforms.
 * The output CRT map must still be pre-baked into the transform.
 *
 * map:     int lookup table, rewritten in place
 * tot_len: number of entries of map to process, walked in blocks of d1*d2
 * d1, d2:  the two factors of the embedded PFA
 *
 * Each block of d1*d2 entries is copied to a temporary and written back as
 * map[k + m*d1 + n] = old[k + (m*d1 + n*d2) % (d1*d2)].
 * All arguments are expanded more than once, so they must not have side
 * effects. memcpy() must be declared at the point of use. */
#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)                          \
    do {                                                                      \
        int mtmp[(d1)*(d2)];                                                  \
        for (int k = 0; k < (tot_len); k += (d1)*(d2)) {                      \
            memcpy(mtmp, &(map)[k], (d1)*(d2)*sizeof(*mtmp));                 \
            for (int m = 0; m < (d2); m++)                                    \
                for (int n = 0; n < (d1); n++)                                \
                    (map)[k + m*(d1) + n] =                                   \
                        mtmp[(m*(d1) + n*(d2)) % ((d1)*(d2))];                \
        }                                                                     \
    } while (0)
/* Generates a Ruritanian PFA input map for a d1 x d2 decomposition into
 * s->map. The table is newly allocated with s->len entries and filled in
 * consecutive blocks of d1*d2 entries.
 * On success s->map_dir is set to opts->map_dir, or to FF_TX_MAP_GATHER if
 * opts is NULL.
 * Returns 0 on success or AVERROR(ENOMEM) if the allocation fails. */
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts,
int d1, int d2);
/* Create a subtransform in the current context with the given parameters.
* The flags parameter from FFTXCodelet.init() should be preserved as much
* as that's possible.
@ -250,11 +284,18 @@ int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
/* Clear the context by freeing all tables, maps and subtransforms. */
void ff_tx_clear_ctx(AVTXContext *s);
/* Generate a default map (0->len or 0, (len-1)->1 for inverse transforms)
* for a context. */
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts);
/*
* Generates the PFA permutation table into AVTXContext->pfatab. The end table
* is appended to the start table.
* The `inv` flag should only be enabled if the lookup tables of subtransforms
* won't get flattened.
*/
int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m);
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts,
int inv, int n, int m);
/*
* Generates a standard-ish (slightly modified) Split-Radix revtab into
@ -262,7 +303,7 @@ int ff_tx_gen_compound_mapping(AVTXContext *s, int n, int m);
* If opts->map_dir is FF_TX_MAP_SCATTER, it has to be applied like
* out[map[i]] = in[i], otherwise (FF_TX_MAP_GATHER, or opts is NULL) it has
* to be applied as out[i] = in[map[i]]
*/
int ff_tx_gen_ptwo_revtab(AVTXContext *s, int invert_lookup);
int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts);
/*
* Generates an index into AVTXContext->inplace_idx that if followed in the
@ -303,7 +344,8 @@ int ff_tx_gen_inplace_map(AVTXContext *s, int len);
* to out[i] = src[map[i]].
*/
int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
int inv_lookup, int basis, int dual_stride);
FFTXCodeletOptions *opts,
int basis, int dual_stride);
/* Typed init function to initialize shared tables. Will initialize all tables
* for all factors of a length. */

@ -479,30 +479,15 @@ static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s,
int len, int inv,
const void *scale)
{
int ret = 0;
TX_TAB(ff_tx_init_tabs)(len);
if (flags & FF_TX_PRESHUFFLE) {
s->map = av_malloc(len*sizeof(s->map));
s->map[0] = 0; /* DC is always at the start */
if (inv) /* Reversing the ACs flips the transform direction */
for (int i = 1; i < len; i++)
s->map[i] = len - i;
else
for (int i = 1; i < len; i++)
s->map[i] = i;
}
/* Our 15-point transform is actually a 5x3 PFA, so embed its input map. */
if (len == 15) {
int tmp[15];
memcpy(tmp, s->map, 15*sizeof(*tmp));
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 3; j++)
s->map[i*3 + j] = tmp[(i*3 + j*5) % 15];
}
}
if (len == 15)
ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
else if (flags & FF_TX_PRESHUFFLE)
ret = ff_tx_gen_default_map(s, opts);
return 0;
return ret;
}
#define DECL_FACTOR_S(n) \
@ -605,7 +590,7 @@ static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s,
const void *scale)
{
TX_TAB(ff_tx_init_tabs)(len);
return ff_tx_gen_ptwo_revtab(s, opts ? opts->invert_lookup : 1);
return ff_tx_gen_ptwo_revtab(s, opts);
}
#define DECL_SR_CODELET_DEF(n) \
@ -742,7 +727,9 @@ static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s,
{
int ret;
int is_inplace = !!(flags & AV_TX_INPLACE);
FFTXCodeletOptions sub_opts = { .invert_lookup = !is_inplace };
FFTXCodeletOptions sub_opts = {
.map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
};
flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
flags |= AV_TX_INPLACE; /* in-place */
@ -974,7 +961,9 @@ static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
sub_len, inv, scale)))
return ret;
if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
/* Generate PFA map */
if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
cd->factors[0], sub_len)))
return ret;
if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
@ -1128,7 +1117,9 @@ static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
const void *scale)
{
int ret;
FFTXCodeletOptions sub_opts = { .invert_lookup = inv };
FFTXCodeletOptions sub_opts = {
.map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
};
s->scale_d = *((SCALE_TYPE *)scale);
s->scale_f = s->scale_d;
@ -1328,7 +1319,7 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
const void *scale)
{
int ret, sub_len;
FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
len >>= 1;
sub_len = len / cd->factors[0];
@ -1344,9 +1335,13 @@ static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
sub_len, inv, scale)))
return ret;
if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
return ret;
/* Our 15-point transform is also a compound one, so embed its input map */
if (cd->factors[0] == 15)
TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
return ret;

@ -75,12 +75,11 @@ static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
int len, int inv, \
const void *scale) \
{ \
const int inv_lookup = opts ? opts->invert_lookup : 1; \
ff_tx_init_tabs_float(len); \
if (cd->max_len == 2) \
return ff_tx_gen_ptwo_revtab(s, inv_lookup); \
return ff_tx_gen_ptwo_revtab(s, opts); \
else \
return ff_tx_gen_split_radix_parity_revtab(s, len, inv, inv_lookup, \
return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, \
basis, interleave); \
}
@ -91,27 +90,27 @@ static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
uint64_t flags, FFTXCodeletOptions *opts,
int len, int inv, const void *scale)
{
int ret;
/* The transformations below are performed in the gather domain,
* so override the option and let the infrastructure convert the map
* to SCATTER if needed. */
FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
TX_TAB(ff_tx_init_tabs)(len);
s->map = av_malloc(len*sizeof(s->map));
s->map[0] = 0; /* DC is always at the start */
if (inv) /* Reversing the ACs flips the transform direction */
for (int i = 1; i < len; i++)
s->map[i] = len - i;
if (len == 15)
ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5);
else
for (int i = 1; i < len; i++)
s->map[i] = i;
ret = ff_tx_gen_default_map(s, &sub_opts);
if (ret < 0)
return ret;
if (len == 15) {
int cnt = 0, tmp[15];
/* Our 15-point transform is actually a 5x3 PFA, so embed its input map. */
memcpy(tmp, s->map, 15*sizeof(*tmp));
for (int i = 0; i < 5; i++)
for (int j = 0; j < 3; j++)
s->map[i*3 + j] = tmp[(i*3 + j*5) % 15];
/* Special 15-point assembly permutation */
/* Special permutation to simplify loads in the pre-permuted version */
memcpy(tmp, s->map, 15*sizeof(*tmp));
for (int i = 1; i < 15; i += 3) {
s->map[cnt] = tmp[i];
@ -139,7 +138,7 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
int len, int inv, const void *scale)
{
int ret;
FFTXCodeletOptions sub_opts = { .invert_lookup = 1 };
FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
s->scale_d = *((SCALE_TYPE *)scale);
s->scale_f = s->scale_d;
@ -177,7 +176,7 @@ static av_cold int fft_pfa_init(AVTXContext *s,
{
int ret;
int sub_len = len / cd->factors[0];
FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
flags |= AV_TX_INPLACE; /* in-place */
@ -188,13 +187,18 @@ static av_cold int fft_pfa_init(AVTXContext *s,
sub_len, inv, scale)))
return ret;
if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
return ret;
if (cd->factors[0] == 15) {
int tmp[15];
/* Our 15-point transform is also a compound one, so embed its input map */
TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
/* Special permutation to simplify loads in the pre-permuted version */
for (int k = 0; k < s->sub[0].len; k++) {
int cnt = 0;
int tmp[15];
memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
for (int i = 1; i < 15; i += 3) {
s->map[k*15 + cnt] = tmp[i];

Loading…
Cancel
Save